18dffb485Schristos /* Convert multibyte character to wide character. 2*4b169a6bSchristos Copyright (C) 1999-2002, 2005-2022 Free Software Foundation, Inc. 38dffb485Schristos 4*4b169a6bSchristos This file is free software: you can redistribute it and/or modify 5*4b169a6bSchristos it under the terms of the GNU Lesser General Public License as 6*4b169a6bSchristos published by the Free Software Foundation; either version 2.1 of the 7*4b169a6bSchristos License, or (at your option) any later version. 88dffb485Schristos 9*4b169a6bSchristos This file is distributed in the hope that it will be useful, 108dffb485Schristos but WITHOUT ANY WARRANTY; without even the implied warranty of 118dffb485Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12*4b169a6bSchristos GNU Lesser General Public License for more details. 138dffb485Schristos 14*4b169a6bSchristos You should have received a copy of the GNU Lesser General Public License 158dffb485Schristos along with this program. If not, see <https://www.gnu.org/licenses/>. */ 168dffb485Schristos 178dffb485Schristos /* Written by Bruno Haible <bruno@clisp.org>, 2008. */ 188dffb485Schristos 198dffb485Schristos /* This file contains the body of the mbrtowc and mbrtoc32 functions, 208dffb485Schristos when GNULIB_defined_mbstate_t is defined. */ 218dffb485Schristos 228dffb485Schristos char *pstate = (char *)ps; 238dffb485Schristos 248dffb485Schristos if (s == NULL) 258dffb485Schristos { 268dffb485Schristos pwc = NULL; 278dffb485Schristos s = ""; 288dffb485Schristos n = 1; 298dffb485Schristos } 308dffb485Schristos 318dffb485Schristos if (n == 0) 328dffb485Schristos return (size_t)(-2); 338dffb485Schristos 348dffb485Schristos /* Here n > 0. */ 358dffb485Schristos 368dffb485Schristos if (pstate == NULL) 378dffb485Schristos pstate = internal_state; 388dffb485Schristos 398dffb485Schristos { 408dffb485Schristos size_t nstate = pstate[0]; 418dffb485Schristos char buf[4]; 428dffb485Schristos const char *p; 438dffb485Schristos size_t m; 448dffb485Schristos enc_t enc; 458dffb485Schristos int res; 468dffb485Schristos 478dffb485Schristos switch (nstate) 488dffb485Schristos { 498dffb485Schristos case 0: 508dffb485Schristos p = s; 518dffb485Schristos m = n; 528dffb485Schristos break; 538dffb485Schristos case 3: 548dffb485Schristos buf[2] = pstate[3]; 558dffb485Schristos FALLTHROUGH; 568dffb485Schristos case 2: 578dffb485Schristos buf[1] = pstate[2]; 588dffb485Schristos FALLTHROUGH; 598dffb485Schristos case 1: 608dffb485Schristos buf[0] = pstate[1]; 618dffb485Schristos p = buf; 628dffb485Schristos m = nstate; 638dffb485Schristos buf[m++] = s[0]; 648dffb485Schristos if (n >= 2 && m < 4) 658dffb485Schristos { 668dffb485Schristos buf[m++] = s[1]; 678dffb485Schristos if (n >= 3 && m < 4) 688dffb485Schristos buf[m++] = s[2]; 698dffb485Schristos } 708dffb485Schristos break; 718dffb485Schristos default: 728dffb485Schristos errno = EINVAL; 738dffb485Schristos return (size_t)(-1); 748dffb485Schristos } 758dffb485Schristos 768dffb485Schristos /* Here m > 0. */ 778dffb485Schristos 788dffb485Schristos enc = locale_encoding_classification (); 798dffb485Schristos 808dffb485Schristos if (enc == enc_utf8) /* UTF-8 */ 818dffb485Schristos { 828dffb485Schristos /* Achieve 838dffb485Schristos - multi-thread safety and 848dffb485Schristos - the ability to produce wide character values > WCHAR_MAX 858dffb485Schristos by not calling mbtowc() at all. */ 868dffb485Schristos #include "mbrtowc-impl-utf8.h" 878dffb485Schristos } 888dffb485Schristos else 898dffb485Schristos { 908dffb485Schristos /* The hidden internal state of mbtowc would make this function not 918dffb485Schristos multi-thread safe. Achieve multi-thread safety through a lock. */ 928dffb485Schristos wchar_t wc; 938dffb485Schristos res = mbtowc_with_lock (&wc, p, m); 948dffb485Schristos 958dffb485Schristos if (res >= 0) 968dffb485Schristos { 978dffb485Schristos if ((wc == 0) != (res == 0)) 988dffb485Schristos abort (); 998dffb485Schristos if (pwc != NULL) 1008dffb485Schristos *pwc = wc; 1018dffb485Schristos goto success; 1028dffb485Schristos } 1038dffb485Schristos 1048dffb485Schristos /* mbtowc does not distinguish between invalid and incomplete multibyte 1058dffb485Schristos sequences. But mbrtowc needs to make this distinction. 1068dffb485Schristos There are two possible approaches: 1078dffb485Schristos - Use iconv() and its return value. 1088dffb485Schristos - Use built-in knowledge about the possible encodings. 1098dffb485Schristos Given the low quality of implementation of iconv() on the systems 1108dffb485Schristos that lack mbrtowc(), we use the second approach. 1118dffb485Schristos The possible encodings are: 1128dffb485Schristos - 8-bit encodings, 1138dffb485Schristos - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 1148dffb485Schristos - UTF-8 (already handled above). 1158dffb485Schristos Use specialized code for each. */ 1168dffb485Schristos if (m >= 4 || m >= MB_CUR_MAX) 1178dffb485Schristos goto invalid; 1188dffb485Schristos /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 1198dffb485Schristos switch (enc) 1208dffb485Schristos { 1218dffb485Schristos /* As a reference for this code, you can use the GNU libiconv 1228dffb485Schristos implementation. Look for uses of the RET_TOOFEW macro. */ 1238dffb485Schristos 1248dffb485Schristos case enc_eucjp: /* EUC-JP */ 1258dffb485Schristos { 1268dffb485Schristos if (m == 1) 1278dffb485Schristos { 1288dffb485Schristos unsigned char c = (unsigned char) p[0]; 1298dffb485Schristos 1308dffb485Schristos if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 1318dffb485Schristos goto incomplete; 1328dffb485Schristos } 1338dffb485Schristos if (m == 2) 1348dffb485Schristos { 1358dffb485Schristos unsigned char c = (unsigned char) p[0]; 1368dffb485Schristos 1378dffb485Schristos if (c == 0x8f) 1388dffb485Schristos { 1398dffb485Schristos unsigned char c2 = (unsigned char) p[1]; 1408dffb485Schristos 1418dffb485Schristos if (c2 >= 0xa1 && c2 < 0xff) 1428dffb485Schristos goto incomplete; 1438dffb485Schristos } 1448dffb485Schristos } 1458dffb485Schristos goto invalid; 1468dffb485Schristos } 1478dffb485Schristos 1488dffb485Schristos case enc_94: /* EUC-KR, GB2312, BIG5 */ 1498dffb485Schristos { 1508dffb485Schristos if (m == 1) 1518dffb485Schristos { 1528dffb485Schristos unsigned char c = (unsigned char) p[0]; 1538dffb485Schristos 1548dffb485Schristos if (c >= 0xa1 && c < 0xff) 1558dffb485Schristos goto incomplete; 1568dffb485Schristos } 1578dffb485Schristos goto invalid; 1588dffb485Schristos } 1598dffb485Schristos 1608dffb485Schristos case enc_euctw: /* EUC-TW */ 1618dffb485Schristos { 1628dffb485Schristos if (m == 1) 1638dffb485Schristos { 1648dffb485Schristos unsigned char c = (unsigned char) p[0]; 1658dffb485Schristos 1668dffb485Schristos if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 1678dffb485Schristos goto incomplete; 1688dffb485Schristos } 1698dffb485Schristos else /* m == 2 || m == 3 */ 1708dffb485Schristos { 1718dffb485Schristos unsigned char c = (unsigned char) p[0]; 1728dffb485Schristos 1738dffb485Schristos if (c == 0x8e) 1748dffb485Schristos goto incomplete; 1758dffb485Schristos } 1768dffb485Schristos goto invalid; 1778dffb485Schristos } 1788dffb485Schristos 1798dffb485Schristos case enc_gb18030: /* GB18030 */ 1808dffb485Schristos { 1818dffb485Schristos if (m == 1) 1828dffb485Schristos { 1838dffb485Schristos unsigned char c = (unsigned char) p[0]; 1848dffb485Schristos 1858dffb485Schristos if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 1868dffb485Schristos goto incomplete; 1878dffb485Schristos } 1888dffb485Schristos else /* m == 2 || m == 3 */ 1898dffb485Schristos { 1908dffb485Schristos unsigned char c = (unsigned char) p[0]; 1918dffb485Schristos 1928dffb485Schristos if (c >= 0x90 && c <= 0xe3) 1938dffb485Schristos { 1948dffb485Schristos unsigned char c2 = (unsigned char) p[1]; 1958dffb485Schristos 1968dffb485Schristos if (c2 >= 0x30 && c2 <= 0x39) 1978dffb485Schristos { 1988dffb485Schristos if (m == 2) 1998dffb485Schristos goto incomplete; 2008dffb485Schristos else /* m == 3 */ 2018dffb485Schristos { 2028dffb485Schristos unsigned char c3 = (unsigned char) p[2]; 2038dffb485Schristos 2048dffb485Schristos if (c3 >= 0x81 && c3 <= 0xfe) 2058dffb485Schristos goto incomplete; 2068dffb485Schristos } 2078dffb485Schristos } 2088dffb485Schristos } 2098dffb485Schristos } 2108dffb485Schristos goto invalid; 2118dffb485Schristos } 2128dffb485Schristos 2138dffb485Schristos case enc_sjis: /* SJIS */ 2148dffb485Schristos { 2158dffb485Schristos if (m == 1) 2168dffb485Schristos { 2178dffb485Schristos unsigned char c = (unsigned char) p[0]; 2188dffb485Schristos 2198dffb485Schristos if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 2208dffb485Schristos || (c >= 0xf0 && c <= 0xf9)) 2218dffb485Schristos goto incomplete; 2228dffb485Schristos } 2238dffb485Schristos goto invalid; 2248dffb485Schristos } 2258dffb485Schristos 2268dffb485Schristos default: 2278dffb485Schristos /* An unknown multibyte encoding. */ 2288dffb485Schristos goto incomplete; 2298dffb485Schristos } 2308dffb485Schristos } 2318dffb485Schristos 2328dffb485Schristos success: 2338dffb485Schristos /* res >= 0 is the corrected return value of 2348dffb485Schristos mbtowc_with_lock (&wc, p, m). */ 2358dffb485Schristos if (nstate >= (res > 0 ? res : 1)) 2368dffb485Schristos abort (); 2378dffb485Schristos res -= nstate; 2388dffb485Schristos pstate[0] = 0; 2398dffb485Schristos return res; 2408dffb485Schristos 2418dffb485Schristos incomplete: 2428dffb485Schristos { 2438dffb485Schristos size_t k = nstate; 2448dffb485Schristos /* Here 0 <= k < m < 4. */ 2458dffb485Schristos pstate[++k] = s[0]; 2468dffb485Schristos if (k < m) 2478dffb485Schristos { 2488dffb485Schristos pstate[++k] = s[1]; 2498dffb485Schristos if (k < m) 2508dffb485Schristos pstate[++k] = s[2]; 2518dffb485Schristos } 2528dffb485Schristos if (k != m) 2538dffb485Schristos abort (); 2548dffb485Schristos } 2558dffb485Schristos pstate[0] = m; 2568dffb485Schristos return (size_t)(-2); 2578dffb485Schristos 2588dffb485Schristos invalid: 2598dffb485Schristos errno = EILSEQ; 2608dffb485Schristos /* The conversion state is undefined, says POSIX. */ 2618dffb485Schristos return (size_t)(-1); 2628dffb485Schristos } 263