/* xref: /spdk/include/spdk_internal/utf.h (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5) */
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation.
 *   All rights reserved.
 */
5 
6 #ifndef SPDK_UTF_H_
7 #define SPDK_UTF_H_
8 
9 #include "spdk/stdinc.h"
10 
11 #include "spdk/endian.h"
12 #include "spdk/likely.h"
13 #include "spdk/string.h"
14 
/*
 * Check whether a byte is a UTF-8 continuation (tail) byte.
 *
 * \param c Byte to test.
 *
 * \return true if c is in the range 0x80..0xBF, false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	/* Tail bytes have the bit pattern 10xxxxxx, i.e. c >= 0x80 && c <= 0xBF */
	return (c & 0xC0) == 0x80;
}
21 
/*
 * Check for a valid UTF-8 encoding of a single codepoint.
 *
 * Overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF) and
 * codepoints above U+10FFFF are rejected.
 *
 * \param start Pointer to the first byte to examine.
 * \param end Pointer one past the last byte that may be read.
 *
 * \return Length in bytes of the valid UTF-8 sequence beginning at start,
 * 0 if start == end (no input), or negative if the sequence is invalid or
 * truncated.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	const uint8_t *p = start;
	uint8_t b0, b1, b2, b3;

	if (p == end) {
		/* Empty input: nothing to validate */
		return 0;
	}

	b0 = *p;

	if (b0 <= 0x7F) {
		/* 00..7F - single byte (ASCII) */
		return 1;
	}

	if (b0 <= 0xC1) {
		/*
		 * Invalid start byte: 80..BF are tail bytes, and C0..C1 could
		 * only begin an overlong two-byte encoding.
		 */
		return -1;
	}

	if (++p == end) {
		/* Not enough bytes left */
		return -1;
	}
	b1 = *p;

	if (b0 <= 0xDF) {
		/* C2..DF 80..BF */
		if (!utf8_tail(b1)) {
			return -1;
		}
		return 2;
	}

	if (++p == end) {
		/* Not enough bytes left */
		return -1;
	}
	b2 = *p;

	if (b0 == 0xE0) {
		/* E0 A0..BF 80..BF (second byte below A0 would be overlong) */
		if (b1 < 0xA0 || b1 > 0xBF || !utf8_tail(b2)) {
			return -1;
		}
		return 3;
	} else if (b0 == 0xED && b1 >= 0xA0) {
		/*
		 * UTF-16 surrogate pairs use U+D800..U+DFFF, which would be encoded as
		 * ED A0..BF 80..BF in UTF-8; however, surrogate pairs are not allowed in UTF-8.
		 */
		return -1;
	} else if (b0 <= 0xEF) {
		/* E1..EF 80..BF 80..BF */
		if (!utf8_tail(b1) || !utf8_tail(b2)) {
			return -1;
		}
		return 3;
	}

	if (++p == end) {
		/* Not enough bytes left */
		return -1;
	}
	b3 = *p;

	if (b0 == 0xF0) {
		/* F0 90..BF 80..BF 80..BF (second byte below 90 would be overlong) */
		if (b1 < 0x90 || b1 > 0xBF || !utf8_tail(b2) || !utf8_tail(b3)) {
			return -1;
		}
		return 4;
	} else if (b0 <= 0xF3) {
		/* F1..F3 80..BF 80..BF 80..BF */
		if (!utf8_tail(b1) || !utf8_tail(b2) || !utf8_tail(b3)) {
			return -1;
		}
		return 4;
	} else if (b0 == 0xF4) {
		/* F4 80..8F 80..BF 80..BF (second byte above 8F would exceed U+10FFFF) */
		if (b1 < 0x80 || b1 > 0x8F || !utf8_tail(b2) || !utf8_tail(b3)) {
			return -1;
		}
		return 4;
	}

	/* F5..FF can never start a valid sequence */
	return -1;
}
116 
/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least
 * 1 readable byte.
 *
 * \return The value of data[0].
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return data[0];
}
122 
/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least
 * 2 readable bytes.
 *
 * \return Value assembled from the low 5 bits of data[0] and the low 6 bits
 * of data[1].
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	uint32_t codepoint;

	codepoint = (uint32_t)(data[0] & 0x1F) << 6;
	codepoint |= (uint32_t)(data[1] & 0x3F);

	return codepoint;
}
133 
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least
 * 3 readable bytes.
 *
 * \return Value assembled from the low 4 bits of data[0] and the low 6 bits
 * of each of data[1] and data[2].
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	uint32_t codepoint;

	codepoint = (uint32_t)(data[0] & 0x0F) << 12;
	codepoint |= (uint32_t)(data[1] & 0x3F) << 6;
	codepoint |= (uint32_t)(data[2] & 0x3F);

	return codepoint;
}
145 
/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least
 * 4 readable bytes.
 *
 * \return Value assembled from the low 3 bits of data[0] and the low 6 bits
 * of each of data[1], data[2] and data[3].
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	uint32_t codepoint;

	codepoint = (uint32_t)(data[0] & 0x07) << 18;
	codepoint |= (uint32_t)(data[1] & 0x3F) << 12;
	codepoint |= (uint32_t)(data[2] & 0x3F) << 6;
	codepoint |= (uint32_t)(data[3] & 0x3F);

	return codepoint;
}
158 
/*
 * Encode a single Unicode codepoint as UTF-8.
 *
 * buf must have at least 4 bytes of space available (hence unsafe).
 *
 * \param buf Destination buffer; nothing is written when encoding fails.
 * \param c Codepoint to encode.
 *
 * \return Number of bytes written to buf (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or greater than U+10FFFF.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	if (c <= 0x7F) {
		buf[0] = (uint8_t)c;
		return 1;
	} else if (c <= 0x7FF) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	} else if (c >= 0xD800 && c <= 0xDFFF) {
		/* UTF-16 surrogate pairs - invalid in UTF-8 */
		return -1;
	} else if (c <= 0xFFFF) {
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	} else if (c <= 0x10FFFF) {
		buf[0] = (uint8_t)(0xF0 | (c >> 18));
		buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
		buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[3] = (uint8_t)(0x80 | (c & 0x3F));
		return 4;
	}

	/* Beyond the Unicode range */
	return -1;
}
193 
/*
 * Compute the number of bytes needed to encode a codepoint as UTF-8.
 *
 * \param c Codepoint to measure.
 *
 * \return Encoded length in bytes (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or greater than U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	if (c <= 0x7F) {
		return 1;
	} else if (c <= 0x7FF) {
		return 2;
	} else if (c >= 0xD800 && c <= 0xDFFF) {
		/* UTF-16 surrogate pairs - invalid in UTF-8 */
		return -1;
	} else if (c <= 0xFFFF) {
		return 3;
	} else if (c <= 0x10FFFF) {
		return 4;
	}

	/* Beyond the Unicode range */
	return -1;
}
211 
/*
 * Check whether a value is a UTF-16 high (leading) surrogate.
 *
 * \return true if val is in the range 0xD800..0xDBFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	return val >= 0xD800 && val <= 0xDBFF;
}
217 
/*
 * Check whether a value is a UTF-16 low (trailing) surrogate.
 *
 * \return true if val is in the range 0xDC00..0xDFFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	return val >= 0xDC00 && val <= 0xDFFF;
}
223 
/*
 * Check for a valid UTF-16LE encoding of a single codepoint.
 *
 * Each code unit is read through from_le16().
 *
 * \param start Pointer to the first 16-bit code unit to examine.
 * \param end Pointer one past the last code unit that may be read.
 *
 * \return Length of valid UTF-16LE sequence in 16-bit code units (1 or 2),
 * 0 if start == end (no input), or negative if invalid.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	const uint16_t *p = start;
	uint16_t high, low;

	if (p == end) {
		/* Empty input: nothing to validate */
		return 0;
	}

	high = from_le16(p);

	if (high <= 0xD7FF || high >= 0xE000) {
		/* Single code unit in BMP */
		return 1;
	}

	if (high >= 0xDC00) {
		/* Low surrogate in first code unit - invalid */
		return -1;
	}

	/* Only D800..DBFF can reach this point */
	assert(utf16_valid_surrogate_high(high));

	if (++p == end) {
		/* Not enough code units left */
		return -1;
	}
	low = from_le16(p);

	if (!utf16_valid_surrogate_low(low)) {
		return -1;
	}

	/* Valid surrogate pair */
	return 2;
}
266 
/*
 * Combine a UTF-16 surrogate pair into a single codepoint.
 *
 * \param high High surrogate; asserted to be in 0xD800..0xDBFF.
 * \param low Low surrogate; asserted to be in 0xDC00..0xDFFF.
 *
 * \return Decoded codepoint: 0x10000 plus the 20-bit value whose upper 10 bits
 * come from high and whose lower 10 bits come from low.
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	uint32_t codepoint;

	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	/* Each surrogate carries 10 payload bits in its low bits */
	codepoint = low;
	codepoint &= 0x3FF;
	codepoint |= ((high & 0x3FF) << 10);
	codepoint += 0x10000;

	return codepoint;
}
282 
/*
 * Split a supplementary-plane codepoint into a UTF-16 surrogate pair.
 *
 * \param codepoint Codepoint to encode; asserted to be in 0x10000..0x10FFFF.
 * \param high Output for the high surrogate (0xD800..0xDBFF).
 * \param low Output for the low surrogate (0xDC00..0xDFFF).
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* The remaining 20-bit value is split 10/10 between the two surrogates */
	codepoint -= 0x10000;
	*high = 0xD800 | (codepoint >> 10);
	*low = 0xDC00 | (codepoint & 0x3FF);

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
296 
297 #endif
298