1e0b8e63eSJohn Marino /*-
2e0b8e63eSJohn Marino * Copyright (c) 2011, 2012
3e0b8e63eSJohn Marino * Zhihao Yuan. All rights reserved.
4e0b8e63eSJohn Marino *
5e0b8e63eSJohn Marino * See the LICENSE file for redistribution information.
6e0b8e63eSJohn Marino */
7e0b8e63eSJohn Marino
8e0b8e63eSJohn Marino #include <sys/types.h>
9e0b8e63eSJohn Marino
10e0b8e63eSJohn Marino int looks_utf8(const char *, size_t);
11e0b8e63eSJohn Marino int looks_utf16(const char *, size_t);
12e0b8e63eSJohn Marino int decode_utf8(const char *);
13e0b8e63eSJohn Marino int decode_utf16(const char *, int);
14e0b8e63eSJohn Marino
/*
 * Character classes used by text_chars[] below.  The table is indexed by
 * byte value; each entry says in which kind of text that byte may appear.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *	Classification of all 256 byte values.  Only lookups are performed
 *	on this table, so it is declared const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
41e0b8e63eSJohn Marino
42e0b8e63eSJohn Marino /*
43e0b8e63eSJohn Marino * looks_utf8 --
44e0b8e63eSJohn Marino * Decide whether some text looks like UTF-8. Returns:
45e0b8e63eSJohn Marino *
46e0b8e63eSJohn Marino * -1: invalid UTF-8
47e0b8e63eSJohn Marino * 0: uses odd control characters, so doesn't look like text
48e0b8e63eSJohn Marino * 1: 7-bit text
49e0b8e63eSJohn Marino * 2: definitely UTF-8 text (valid high-bit set bytes)
50e0b8e63eSJohn Marino *
51e0b8e63eSJohn Marino * Based on RFC 3629. UTF-8 with BOM is not accepted.
52e0b8e63eSJohn Marino *
53e0b8e63eSJohn Marino * PUBLIC: int looks_utf8(const char *, size_t);
54e0b8e63eSJohn Marino */
55e0b8e63eSJohn Marino int
looks_utf8(const char * ibuf,size_t nbytes)56e0b8e63eSJohn Marino looks_utf8(const char *ibuf, size_t nbytes)
57e0b8e63eSJohn Marino {
58e0b8e63eSJohn Marino const u_char *buf = (u_char *)ibuf;
59e0b8e63eSJohn Marino size_t i;
60e0b8e63eSJohn Marino int n;
61e0b8e63eSJohn Marino int gotone = 0, ctrl = 0;
62e0b8e63eSJohn Marino
63e0b8e63eSJohn Marino for (i = 0; i < nbytes; i++) {
64e0b8e63eSJohn Marino if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
65e0b8e63eSJohn Marino /*
66e0b8e63eSJohn Marino * Even if the whole file is valid UTF-8 sequences,
67e0b8e63eSJohn Marino * still reject it if it uses weird control characters.
68e0b8e63eSJohn Marino */
69e0b8e63eSJohn Marino
70e0b8e63eSJohn Marino if (text_chars[buf[i]] != T)
71e0b8e63eSJohn Marino ctrl = 1;
72e0b8e63eSJohn Marino } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
73e0b8e63eSJohn Marino return -1;
74e0b8e63eSJohn Marino } else { /* 11xxxxxx begins UTF-8 */
75e0b8e63eSJohn Marino int following;
76e0b8e63eSJohn Marino
77e0b8e63eSJohn Marino if ((buf[i] & 0x20) == 0) /* 110xxxxx */
78e0b8e63eSJohn Marino if (buf[i] > 0xC1) /* C0, C1 */
79e0b8e63eSJohn Marino following = 1;
80e0b8e63eSJohn Marino else return -1;
81e0b8e63eSJohn Marino else if ((buf[i] & 0x10) == 0) /* 1110xxxx */
82e0b8e63eSJohn Marino following = 2;
83e0b8e63eSJohn Marino else if ((buf[i] & 0x08) == 0) /* 11110xxx */
84e0b8e63eSJohn Marino if (buf[i] < 0xF5)
85e0b8e63eSJohn Marino following = 3;
86e0b8e63eSJohn Marino else return -1; /* F5, F6, F7 */
87e0b8e63eSJohn Marino else
88e0b8e63eSJohn Marino return -1; /* F8~FF */
89e0b8e63eSJohn Marino
90e0b8e63eSJohn Marino for (n = 0; n < following; n++) {
91e0b8e63eSJohn Marino i++;
92e0b8e63eSJohn Marino if (i >= nbytes)
93e0b8e63eSJohn Marino goto done;
94e0b8e63eSJohn Marino
95*b1ac2ebbSDaniel Fojt if ((buf[i] & 0xc0) != 0x80) /* 10xxxxxx */
96e0b8e63eSJohn Marino return -1;
97e0b8e63eSJohn Marino }
98e0b8e63eSJohn Marino
99e0b8e63eSJohn Marino gotone = 1;
100e0b8e63eSJohn Marino }
101e0b8e63eSJohn Marino }
102e0b8e63eSJohn Marino done:
103e0b8e63eSJohn Marino return ctrl ? 0 : (gotone ? 2 : 1);
104e0b8e63eSJohn Marino }
105e0b8e63eSJohn Marino
106e0b8e63eSJohn Marino /*
107e0b8e63eSJohn Marino * looks_utf16 --
108e0b8e63eSJohn Marino * Decide whether some text looks like UTF-16. Returns:
109e0b8e63eSJohn Marino *
110e0b8e63eSJohn Marino * 0: invalid UTF-16
111e0b8e63eSJohn Marino * 1: Little-endian UTF-16
112e0b8e63eSJohn Marino * 2: Big-endian UTF-16
113e0b8e63eSJohn Marino *
114e0b8e63eSJohn Marino * PUBLIC: int looks_utf16(const char *, size_t);
115e0b8e63eSJohn Marino */
116e0b8e63eSJohn Marino int
looks_utf16(const char * ibuf,size_t nbytes)117e0b8e63eSJohn Marino looks_utf16(const char *ibuf, size_t nbytes)
118e0b8e63eSJohn Marino {
119e0b8e63eSJohn Marino const u_char *buf = (u_char *)ibuf;
120e0b8e63eSJohn Marino int bigend;
121e0b8e63eSJohn Marino size_t i;
122e0b8e63eSJohn Marino unsigned int c;
123e0b8e63eSJohn Marino int bom;
124e0b8e63eSJohn Marino int following = 0;
125e0b8e63eSJohn Marino
126e0b8e63eSJohn Marino if (nbytes < 2)
127e0b8e63eSJohn Marino return 0;
128e0b8e63eSJohn Marino
129e0b8e63eSJohn Marino bom = buf[0] << 8 ^ buf[1];
130e0b8e63eSJohn Marino if (bom == 0xFFFE)
131e0b8e63eSJohn Marino bigend = 0;
132e0b8e63eSJohn Marino else if (bom == 0xFEFF)
133e0b8e63eSJohn Marino bigend = 1;
134e0b8e63eSJohn Marino else
135e0b8e63eSJohn Marino return 0;
136e0b8e63eSJohn Marino
137e0b8e63eSJohn Marino for (i = 2; i + 1 < nbytes; i += 2) {
138e0b8e63eSJohn Marino if (bigend)
139e0b8e63eSJohn Marino c = buf[i] << 8 ^ buf[i + 1];
140e0b8e63eSJohn Marino else
141e0b8e63eSJohn Marino c = buf[i] ^ buf[i + 1] << 8;
142e0b8e63eSJohn Marino
143e0b8e63eSJohn Marino if (!following)
144e0b8e63eSJohn Marino if (c < 0xD800 || c > 0xDFFF)
145e0b8e63eSJohn Marino if (c < 128 && text_chars[c] != T)
146e0b8e63eSJohn Marino return 0;
147e0b8e63eSJohn Marino else
148e0b8e63eSJohn Marino following = 0;
149e0b8e63eSJohn Marino else if (c > 0xDBFF)
150e0b8e63eSJohn Marino return 0;
151e0b8e63eSJohn Marino else {
152e0b8e63eSJohn Marino following = 1;
153e0b8e63eSJohn Marino continue;
154e0b8e63eSJohn Marino }
155e0b8e63eSJohn Marino else if (c < 0xDC00 || c > 0xDFFF)
156e0b8e63eSJohn Marino return 0;
157e0b8e63eSJohn Marino }
158e0b8e63eSJohn Marino
159e0b8e63eSJohn Marino return 1 + bigend;
160e0b8e63eSJohn Marino }
161e0b8e63eSJohn Marino
162e0b8e63eSJohn Marino #undef F
163e0b8e63eSJohn Marino #undef T
164e0b8e63eSJohn Marino #undef I
165e0b8e63eSJohn Marino #undef X
166e0b8e63eSJohn Marino
/*
 * decode_utf8 --
 *	Decode the UTF-8 sequence starting at ibuf into a Unicode value.
 *	The leading byte determines how many bytes (one to four) are read.
 *
 *	Returns -1 if the first byte is not a UTF-8 leader, that is, if it
 *	has the form 10xxxxxx or 11111xxx.
 *
 *	Based on RFC 3629, but without error detection: the trailing bytes
 *	are read and combined without being validated, so the caller has to
 *	supply a complete sequence.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const unsigned char lead = p[0];
	int extra;	/* number of trailing bytes */
	int value;	/* payload bits gathered so far */
	int k;

	if ((lead & 0x80) == 0)			/* 0xxxxxxx */
		return lead;

	if ((lead & 0xC0) == 0x80)		/* 10xxxxxx */
		return -1;

	if ((lead & 0xE0) == 0xC0) {		/* 110xxxxx */
		extra = 1;
		value = lead ^ 0xC0;
	} else if ((lead & 0xF0) == 0xE0) {	/* 1110xxxx */
		extra = 2;
		value = lead ^ 0xE0;
	} else if ((lead & 0xF8) == 0xF0) {	/* 11110xxx */
		extra = 3;
		value = lead ^ 0xF0;
	} else {				/* 11111xxx */
		return -1;
	}

	/* Each trailing byte supplies six more bits once its marker is cleared. */
	for (k = 1; k <= extra; k++)
		value = (value << 6) ^ (p[k] ^ 0x80);

	return value;
}
198e0b8e63eSJohn Marino
/*
 * decode_utf16 --
 *	Decode the UTF-16 character starting at ibuf into a Unicode value.
 *	A non-zero bigend selects big-endian byte order, zero selects
 *	little-endian.  Two bytes are read, or four when the first 16-bit
 *	unit is a high surrogate.
 *
 *	Returns -1 if the first unit is invalid, i.e. a low surrogate
 *	(0xDC00..0xDFFF).
 *
 *	No error detection on the second unit of a surrogate pair.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets, within a two-byte unit, of its high and low order byte. */
	const int msb = bigend ? 0 : 1;
	const int lsb = 1 - msb;
	unsigned int first, second;

	first = (unsigned int)p[msb] << 8 ^ p[lsb];

	/* Anything outside the surrogate range stands for itself. */
	if (first < 0xD800 || first > 0xDFFF)
		return (int)first;

	/* A low surrogate cannot start a character. */
	if (first > 0xDBFF)
		return -1;

	/* High surrogate: combine it with the unit that follows. */
	second = (unsigned int)p[2 + msb] << 8 ^ p[2 + lsb];

	return (int)(((first ^ 0xD800) << 10 ^ (second ^ 0xDC00)) + 0x10000);
}
233