1*6b445a62SJohn Marino /* mbutil.c -- readline multibyte character utility functions */
2*6b445a62SJohn Marino
3*6b445a62SJohn Marino /* Copyright (C) 2001-2009 Free Software Foundation, Inc.
4*6b445a62SJohn Marino
5*6b445a62SJohn Marino This file is part of the GNU Readline Library (Readline), a library
6*6b445a62SJohn Marino for reading lines of text with interactive input and history editing.
7*6b445a62SJohn Marino
8*6b445a62SJohn Marino Readline is free software: you can redistribute it and/or modify
9*6b445a62SJohn Marino it under the terms of the GNU General Public License as published by
10*6b445a62SJohn Marino the Free Software Foundation, either version 3 of the License, or
11*6b445a62SJohn Marino (at your option) any later version.
12*6b445a62SJohn Marino
13*6b445a62SJohn Marino Readline is distributed in the hope that it will be useful,
14*6b445a62SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
15*6b445a62SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16*6b445a62SJohn Marino GNU General Public License for more details.
17*6b445a62SJohn Marino
18*6b445a62SJohn Marino You should have received a copy of the GNU General Public License
19*6b445a62SJohn Marino along with Readline. If not, see <http://www.gnu.org/licenses/>.
20*6b445a62SJohn Marino */
21*6b445a62SJohn Marino
22*6b445a62SJohn Marino #define READLINE_LIBRARY
23*6b445a62SJohn Marino
24*6b445a62SJohn Marino #if defined (HAVE_CONFIG_H)
25*6b445a62SJohn Marino # include <config.h>
26*6b445a62SJohn Marino #endif
27*6b445a62SJohn Marino
28*6b445a62SJohn Marino #include <sys/types.h>
29*6b445a62SJohn Marino #include <fcntl.h>
30*6b445a62SJohn Marino #include "posixjmp.h"
31*6b445a62SJohn Marino
32*6b445a62SJohn Marino #if defined (HAVE_UNISTD_H)
33*6b445a62SJohn Marino # include <unistd.h> /* for _POSIX_VERSION */
34*6b445a62SJohn Marino #endif /* HAVE_UNISTD_H */
35*6b445a62SJohn Marino
36*6b445a62SJohn Marino #if defined (HAVE_STDLIB_H)
37*6b445a62SJohn Marino # include <stdlib.h>
38*6b445a62SJohn Marino #else
39*6b445a62SJohn Marino # include "ansi_stdlib.h"
40*6b445a62SJohn Marino #endif /* HAVE_STDLIB_H */
41*6b445a62SJohn Marino
42*6b445a62SJohn Marino #include <stdio.h>
43*6b445a62SJohn Marino #include <ctype.h>
44*6b445a62SJohn Marino
45*6b445a62SJohn Marino /* System-specific feature definitions and include files. */
46*6b445a62SJohn Marino #include "rldefs.h"
47*6b445a62SJohn Marino #include "rlmbutil.h"
48*6b445a62SJohn Marino
49*6b445a62SJohn Marino #if defined (TIOCSTAT_IN_SYS_IOCTL)
50*6b445a62SJohn Marino # include <sys/ioctl.h>
51*6b445a62SJohn Marino #endif /* TIOCSTAT_IN_SYS_IOCTL */
52*6b445a62SJohn Marino
53*6b445a62SJohn Marino /* Some standard library routines. */
54*6b445a62SJohn Marino #include "readline.h"
55*6b445a62SJohn Marino
56*6b445a62SJohn Marino #include "rlprivate.h"
57*6b445a62SJohn Marino #include "xmalloc.h"
58*6b445a62SJohn Marino
/* Defined in this file so that the readline and history libraries can both
   use it.  It starts out as 1 when multibyte handling is compiled out and
   as 0 when HANDLE_MULTIBYTE is defined. */
#if !defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 1;
#else
int rl_byte_oriented = 0;
#endif
66*6b445a62SJohn Marino
67*6b445a62SJohn Marino /* **************************************************************** */
68*6b445a62SJohn Marino /* */
69*6b445a62SJohn Marino /* Multibyte Character Utility Functions */
70*6b445a62SJohn Marino /* */
71*6b445a62SJohn Marino /* **************************************************************** */
72*6b445a62SJohn Marino
73*6b445a62SJohn Marino #if defined(HANDLE_MULTIBYTE)
74*6b445a62SJohn Marino
/* Return the byte index reached after advancing COUNT characters in STRING,
   starting from byte index SEED.  A negative SEED is treated as 0, and SEED
   is returned unchanged when COUNT is not positive.  If SEED falls inside a
   multibyte character it is first moved forward to the next character
   boundary, which uses up one of the COUNT characters.  When FIND_NON_ZERO
   is non-zero, characters for which wcwidth() reports 0 are stepped over
   without being counted, and any such characters directly after the final
   position are skipped as well.  Bytes that do not form a valid multibyte
   character count as one character each.  If SEED lies beyond the end of
   STRING, SEED + 1 is returned. */
static int
_rl_find_next_mbchar_internal (string, seed, count, find_non_zero)
     char *string;
     int seed, count, find_non_zero;
{
  size_t tmp, len, length;
  mbstate_t ps;
  int point, adjust;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  if (seed < 0)
    seed = 0;
  if (count <= 0)
    return seed;

  /* _rl_adjust_point returns -1 when SEED is past the end of STRING.  Adding
     that to SEED would start the scan at SEED - 1, which can lie outside the
     string, so treat the position as a single byte and stop here. */
  adjust = _rl_adjust_point (string, seed, &ps);
  if (adjust < 0)
    return seed + 1;
  point = seed + adjust;

  /* If this is true, SEED was not pointing at the first byte of a multibyte
     character.  The point has been corrected; consume one character. */
  if (seed < point)
    count--;

  /* POINT never moves past the terminator, so the number of bytes left is
     always LENGTH - POINT and the string need only be measured once. */
  length = strlen (string);

  while (count > 0)
    {
      len = length - point;
      if (len == 0)
	break;
      tmp = mbrtowc (&wc, string + point, len, &ps);
      if (MB_INVALIDCH (tmp))
	{
	  /* Invalid bytes: assume one byte represents one character. */
	  point++;
	  count--;
	  /* The conversion state is unusable after a failure; reset it. */
	  memset (&ps, 0, sizeof (mbstate_t));
	}
      else if (MB_NULLWCH (tmp))
	break;			/* found wide '\0' */
      else
	{
	  /* Valid character.  A zero-width one is passed over without using
	     up COUNT when FIND_NON_ZERO is set. */
	  point += tmp;
	  if (find_non_zero == 0 || wcwidth (wc) != 0)
	    count--;
	}
    }

  if (find_non_zero)
    {
      /* Also step over zero-width characters that follow the position just
	 reached. */
      tmp = mbrtowc (&wc, string + point, length - point, &ps);
      while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && wcwidth (wc) == 0)
	{
	  point += tmp;
	  tmp = mbrtowc (&wc, string + point, length - point, &ps);
	}
    }

  return point;
}
144*6b445a62SJohn Marino
/* Scan STRING from its beginning and return the byte index at which the last
   character starting before byte index SEED begins.  When FIND_NON_ZERO is
   non-zero, only characters for which wcwidth() is not 0 are remembered as
   candidates.  A byte that does not convert to a valid multibyte character
   is treated as a one-byte character.  Return 0 if SEED is negative and the
   length of STRING if SEED is greater than that length. */
static int
_rl_find_prev_mbchar_internal (string, seed, find_non_zero)
     char *string;
     int seed, find_non_zero;
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;
  int total, cur, last_start;

  memset (&state, 0, sizeof (mbstate_t));
  total = strlen (string);

  if (seed < 0)
    return 0;
  if (seed > total)
    return total;

  last_start = 0;
  for (cur = 0; cur < seed; cur += nbytes)
    {
      nbytes = mbrtowc (&wch, string + cur, total - cur, &state);
      if (MB_INVALIDCH (nbytes))
	{
	  /* Invalid or truncated sequence: count this byte as a character of
	     its own, start again from a clean conversion state, and remember
	     the byte as a candidate since it is assumed to have width. */
	  nbytes = 1;
	  memset (&state, 0, sizeof (mbstate_t));
	  last_start = cur;
	  continue;
	}

      if (MB_NULLWCH (nbytes))
	break;		/* reached a null wide character */

      if (find_non_zero == 0 || wcwidth (wch) != 0)
	last_start = cur;
    }

  return last_start;
}
199*6b445a62SJohn Marino
/* Return the length in bytes of the multibyte character that starts at SRC,
   using PS as the conversion state.  The result is 0 when a null character
   is recognized, -1 when the bytes do not form a valid multibyte character,
   and -2 when they are only the beginning of one.  After either failure
   *PS, when PS is non-null, is cleared to the initial conversion state. */
int
_rl_get_char_len (src, ps)
     char *src;
     mbstate_t *ps;
{
  size_t tmp, avail;

  /* strlen() never counts the terminator, so for an empty string mbrlen()
     would be handed zero bytes and report an incomplete character instead
     of recognizing the null character.  Always let it inspect one byte. */
  avail = strlen (src);
  if (avail == 0)
    avail = 1;

  tmp = mbrlen ((const char *)src, avail, ps);
  if (tmp == (size_t)(-2) || tmp == (size_t)(-1))
    {
      /* Incomplete (-2) or invalid (-1) sequence: the conversion state is
	 no longer meaningful, so reinitialize it for the caller. */
      if (ps)
	memset (ps, 0, sizeof (mbstate_t));
      return (tmp == (size_t)(-2)) ? -2 : -1;
    }

  return (int)tmp;
}
233*6b445a62SJohn Marino
/* Compare the character at BUF1[POS1] with the one at BUF2[POS2], measuring
   each with _rl_get_char_len and its own conversion state (PS1, PS2).
   Return 1 if both lengths are positive and equal and all of the bytes
   agree; return 0 otherwise.  The second character is not measured when
   the first one already has a non-positive length. */
int
_rl_compare_chars (buf1, pos1, ps1, buf2, pos2, ps2)
     char *buf1;
     int pos1;
     mbstate_t *ps1;
     char *buf2;
     int pos2;
     mbstate_t *ps2;
{
  int len1, len2, k;

  len1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (len1 <= 0)
    return 0;

  len2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (len2 <= 0 || len2 != len1)
    return 0;

  for (k = 0; k < len1; k++)
    {
      if (buf1[pos1 + k] != buf2[pos2 + k])
	return 0;
    }

  return 1;
}
259*6b445a62SJohn Marino
/* Walk STRING from its start one multibyte character at a time, updating the
   conversion state through PS, until the walk reaches or passes byte index
   POINT.  Return how many bytes the stopping position lies beyond POINT
   (0 when POINT is already on a character boundary).  Return -1 if POINT is
   invalid, i.e. negative or greater than the length of STRING. */
int
_rl_adjust_point (string, point, ps)
     char *string;
     int point;
     mbstate_t *ps;
{
  size_t nbytes;
  int total, cur;

  total = strlen (string);
  if (point < 0 || point > total)
    return -1;

  for (cur = 0; cur < point; )
    {
      nbytes = mbrlen (string + cur, total - cur, ps);
      if (MB_INVALIDCH (nbytes))
	{
	  /* Invalid or truncated sequence: take the first byte as a whole
	     character and discard the now-undefined conversion state. */
	  cur++;
	  if (ps)
	    memset (ps, 0, sizeof (mbstate_t));
	}
      else if (MB_NULLWCH (nbytes))
	cur++;
      else
	cur += nbytes;
    }

  return (cur - point);
}
303*6b445a62SJohn Marino
/* Return 1 if the LENGTH bytes of MBCHAR are found in STRING starting at
   byte index SEED, and 0 if any byte differs or if fewer than LENGTH bytes
   lie between SEED and END. */
int
_rl_is_mbchar_matched (string, seed, end, mbchar, length)
     char *string;
     int seed, end;
     char *mbchar;
     int length;
{
  int off;

  if (length > (end - seed))
    return 0;

  off = 0;
  while (off < length)
    {
      if (string[seed + off] != mbchar[off])
	return 0;
      off++;
    }

  return 1;
}
321*6b445a62SJohn Marino
322*6b445a62SJohn Marino wchar_t
_rl_char_value(buf,ind)323*6b445a62SJohn Marino _rl_char_value (buf, ind)
324*6b445a62SJohn Marino char *buf;
325*6b445a62SJohn Marino int ind;
326*6b445a62SJohn Marino {
327*6b445a62SJohn Marino size_t tmp;
328*6b445a62SJohn Marino wchar_t wc;
329*6b445a62SJohn Marino mbstate_t ps;
330*6b445a62SJohn Marino int l;
331*6b445a62SJohn Marino
332*6b445a62SJohn Marino if (MB_LEN_MAX == 1 || rl_byte_oriented)
333*6b445a62SJohn Marino return ((wchar_t) buf[ind]);
334*6b445a62SJohn Marino l = strlen (buf);
335*6b445a62SJohn Marino if (ind >= l - 1)
336*6b445a62SJohn Marino return ((wchar_t) buf[ind]);
337*6b445a62SJohn Marino memset (&ps, 0, sizeof (mbstate_t));
338*6b445a62SJohn Marino tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
339*6b445a62SJohn Marino if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
340*6b445a62SJohn Marino return ((wchar_t) buf[ind]);
341*6b445a62SJohn Marino return wc;
342*6b445a62SJohn Marino }
343*6b445a62SJohn Marino #endif /* HANDLE_MULTIBYTE */
344*6b445a62SJohn Marino
/* Move forward COUNT characters in STRING from byte index SEED and return
   the byte index reached.  With multibyte support the work is done by
   _rl_find_next_mbchar_internal, and FLAGS (MB_FIND_NONZERO) selects whether
   only non-zero-width characters are looked for.  Without it the result is
   simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (string, seed, count, flags)
     char *string;
     int seed, count, flags;
{
#if !defined (HANDLE_MULTIBYTE)
  return (seed + count);
#else
  return _rl_find_next_mbchar_internal (string, seed, count, flags);
#endif
}
360*6b445a62SJohn Marino
/* Return the byte index where the character before byte index SEED starts.
   With multibyte support the work is done by _rl_find_prev_mbchar_internal,
   and FLAGS (MB_FIND_NONZERO) selects whether only non-zero-width characters
   are looked for.  Without it the result is SEED - 1, or SEED itself when
   SEED is 0. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (string, seed, flags)
     char *string;
     int seed, flags;
{
#if defined (HANDLE_MULTIBYTE)
  return _rl_find_prev_mbchar_internal (string, seed, flags);
#else
  if (seed == 0)
    return seed;
  return seed - 1;
#endif
}
376