1 /* Conversion UTF-8 to UCS-4.
2 Copyright (C) 2001-2002 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19
20 #include <stddef.h>
21
/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.

   S points to N units of UTF-8 text.  s[0] is read unconditionally, so N
   must be at least 1.  This helper only decodes multibyte sequences: any
   lead byte below 0xc2 (ASCII, a stray continuation byte, or the overlong
   lead bytes 0xc0/0xc1) is reported as invalid.

   Return value:
     2, 3 or 4  a well-formed sequence of that length was decoded into *PUC;
     N          the lead byte announces a sequence longer than the N units
                available (incomplete character); *PUC is set to U+FFFD;
     1          the sequence is invalid; *PUC is set to U+FFFD.

   Overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF) and values
   above U+10FFFF are all treated as invalid.  */
static int
u8_mbtouc_aux (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
          /* Two-byte sequence: U+0080..U+07FF.  */
          if (n < 2)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          /* (b ^ 0x80) < 0x40 holds exactly when b is a continuation
             byte, i.e. 0x80 <= b <= 0xbf.  */
          if ((s[1] ^ 0x80) < 0x40)
            {
              *puc = ((unsigned int) (c & 0x1f) << 6)
                     | (unsigned int) (s[1] ^ 0x80);
              return 2;
            }
          /* invalid multibyte character */
        }
      else if (c < 0xf0)
        {
          /* Three-byte sequence: U+0800..U+FFFF.  */
          if (n < 3)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              /* Reject overlong forms: E0 80..9F would encode < U+0800.  */
              && (c >= 0xe1 || s[1] >= 0xa0)
              /* Reject surrogates: ED A0..BF would encode
                 U+D800..U+DFFF, which is ill-formed in UTF-8.  */
              && (c != 0xed || s[1] < 0xa0))
            {
              *puc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (s[1] ^ 0x80) << 6)
                     | (unsigned int) (s[2] ^ 0x80);
              return 3;
            }
          /* invalid multibyte character */
        }
      else if (c < 0xf8)
        {
          /* Four-byte sequence: U+10000..U+10FFFF.  */
          if (n < 4)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              && (s[3] ^ 0x80) < 0x40
              /* Reject overlong forms: F0 80..8F would encode < U+10000.  */
              && (c >= 0xf1 || s[1] >= 0x90)
              /* Reject anything above U+10FFFF.  */
              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
            {
              *puc = ((unsigned int) (c & 0x07) << 18)
                     | ((unsigned int) (s[1] ^ 0x80) << 12)
                     | ((unsigned int) (s[2] ^ 0x80) << 6)
                     | (unsigned int) (s[3] ^ 0x80);
              return 4;
            }
          /* invalid multibyte character */
        }
      /* Lead bytes 0xf8..0xff (the obsolete 5- and 6-byte forms) fall
         through and are reported as invalid.  */
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}
/* Decode the first character of the UTF-8 string S (N units long), store
   its code point in *PUC and return the number of units it occupies.
   A byte below 0x80 is its own code point and is handled inline; every
   other lead byte is handed to u8_mbtouc_aux.  s[0] is read
   unconditionally, so N must be at least 1.  */
static inline int
u8_mbtouc (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char lead = s[0];

  /* Anything outside the ASCII range needs the full multibyte decoder.  */
  if (lead >= 0x80)
    return u8_mbtouc_aux (puc, s, n);

  *puc = lead;
  return 1;
}
168