xref: /spdk/include/spdk_internal/utf.h (revision b30d57cdad6d2bc75cc1e4e2ebbcebcb0d98dcfa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef SPDK_UTF_H_
35 #define SPDK_UTF_H_
36 
37 #include "spdk/stdinc.h"
38 
39 #include "spdk/endian.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 
/*
 * Test whether a byte is a UTF-8 continuation (tail) byte.
 *
 * \return true if c is in the range 0x80..0xBF, false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	/* Continuation bytes have the bit pattern 10xxxxxx, so the top two bits equal binary 10 */
	return (c >> 6) == 0x2;
}
49 
/*
 * Validate the UTF-8 encoding of the single codepoint at the front of [start, end).
 *
 * Overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF), codepoints above
 * U+10FFFF, and sequences truncated by end are all rejected.
 *
 * \return Length in bytes (1 to 4) of the valid UTF-8 sequence, 0 if start == end,
 * or -1 if the sequence is invalid.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	const uint8_t *cur = start;
	uint8_t lead, t1, t2, t3;
	bool t1_ok;

	if (cur == end) {
		return 0;
	}

	lead = *cur++;

	if (lead < 0x80) {
		/* 00..7F: single-byte (ASCII) */
		return 1;
	}

	if (lead < 0xC2) {
		/* 80..BF are tail bytes; C0 and C1 could only start overlong encodings */
		return -1;
	}

	/* Every remaining form needs at least one tail byte */
	if (cur == end) {
		return -1;
	}
	t1 = *cur++;

	if (lead < 0xE0) {
		/* C2..DF 80..BF */
		return utf8_tail(t1) ? 2 : -1;
	}

	/* Three- and four-byte forms need a second tail byte */
	if (cur == end) {
		return -1;
	}
	t2 = *cur++;

	if (lead < 0xF0) {
		switch (lead) {
		case 0xE0:
			/* E0 A0..BF 80..BF (anything lower would be overlong) */
			t1_ok = (t1 >= 0xA0 && t1 <= 0xBF);
			break;
		case 0xED:
			/*
			 * ED 80..9F 80..BF: ED A0..BF would encode U+D800..U+DFFF, the
			 * UTF-16 surrogates, which are not allowed in UTF-8.
			 */
			t1_ok = (t1 >= 0x80 && t1 <= 0x9F);
			break;
		default:
			/* E1..EC, EE..EF 80..BF 80..BF */
			t1_ok = utf8_tail(t1);
			break;
		}

		return (t1_ok && utf8_tail(t2)) ? 3 : -1;
	}

	/* Only four-byte forms remain; they need a third tail byte */
	if (cur == end) {
		return -1;
	}
	t3 = *cur;

	if (lead == 0xF0) {
		/* F0 90..BF 80..BF 80..BF (anything lower would be overlong) */
		t1_ok = (t1 >= 0x90 && t1 <= 0xBF);
	} else if (lead <= 0xF3) {
		/* F1..F3 80..BF 80..BF 80..BF */
		t1_ok = utf8_tail(t1);
	} else if (lead == 0xF4) {
		/* F4 80..8F 80..BF 80..BF (anything higher would exceed U+10FFFF) */
		t1_ok = (t1 >= 0x80 && t1 <= 0x8F);
	} else {
		/* F5..FF never appear in valid UTF-8 */
		return -1;
	}

	return (t1_ok && utf8_tail(t2) && utf8_tail(t3)) ? 4 : -1;
}
144 
/*
 * Decode a one-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 1 readable byte.
 *
 * \return The value of the first byte of data.
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t)(*data);
}
150 
/*
 * Decode a two-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 2 readable bytes.
 *
 * \return Codepoint assembled from the low 5 bits of data[0] and the low 6 bits of data[1].
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x1F;
	const uint32_t tail_bits = data[1] & 0x3F;

	return (lead_bits << 6) | tail_bits;
}
161 
/*
 * Decode a three-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 3 readable bytes.
 *
 * \return Codepoint assembled from the low 4 bits of data[0] and the low 6 bits of
 * each of data[1] and data[2].
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x0F;
	const uint32_t mid_bits = data[1] & 0x3F;
	const uint32_t low_bits = data[2] & 0x3F;

	return (lead_bits << 12) | (mid_bits << 6) | low_bits;
}
173 
/*
 * Decode a four-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 4 readable bytes.
 *
 * \return Codepoint assembled from the low 3 bits of data[0] and the low 6 bits of
 * each of data[1], data[2] and data[3].
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x07;
	const uint32_t hi_bits = data[1] & 0x3F;
	const uint32_t mid_bits = data[2] & 0x3F;
	const uint32_t low_bits = data[3] & 0x3F;

	return (lead_bits << 18) | (hi_bits << 12) | (mid_bits << 6) | low_bits;
}
186 
/*
 * Encode a single Unicode codepoint as UTF-8.
 *
 * No bounds checking is done on buf (hence unsafe): the caller must provide at least
 * 4 bytes of space. Nothing is written to buf when encoding fails.
 *
 * \return Number of bytes written to buf (1 to 4), or -1 if c is a UTF-16 surrogate
 * (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	if (c < 0x80) {
		/* 0xxxxxxx */
		buf[0] = (uint8_t)c;
		return 1;
	}

	if (c < 0x800) {
		/* 110xxxxx 10xxxxxx */
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	if (c < 0x10000) {
		if (c >= 0xD800 && c <= 0xDFFF) {
			/* UTF-16 surrogates are not valid codepoints in UTF-8 */
			return -1;
		}

		/* 1110xxxx 10xxxxxx 10xxxxxx */
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	if (c > 0x10FFFF) {
		/* Beyond the last Unicode codepoint */
		return -1;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}
221 
/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \return Encoded length in bytes (1 to 4), or -1 if c is a UTF-16 surrogate
 * (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	if (c < 0x80) {
		return 1;
	}

	if (c < 0x800) {
		return 2;
	}

	if (c < 0x10000) {
		/* UTF-16 surrogates are not valid codepoints in UTF-8 */
		return (c >= 0xD800 && c <= 0xDFFF) ? -1 : 3;
	}

	return (c <= 0x10FFFF) ? 4 : -1;
}
239 
/*
 * Test whether a value is a UTF-16 high (leading) surrogate.
 *
 * \return true if val is in the range 0xD800..0xDBFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/* Ignoring the low 10 payload bits, every high surrogate equals 0xD800 */
	return (val & 0xFFFFFC00u) == 0xD800u;
}
245 
/*
 * Test whether a value is a UTF-16 low (trailing) surrogate.
 *
 * \return true if val is in the range 0xDC00..0xDFFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/* Ignoring the low 10 payload bits, every low surrogate equals 0xDC00 */
	return (val & 0xFFFFFC00u) == 0xDC00u;
}
251 
/*
 * Validate the UTF-16LE encoding of the single codepoint at the front of [start, end).
 *
 * Code units are read with from_le16().
 *
 * \return Length of the valid sequence in 16-bit code units (1, or 2 for a surrogate
 * pair), 0 if start == end, or -1 if the sequence is invalid: it begins with a low
 * surrogate, or a high surrogate is not followed by a low surrogate before end.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	uint16_t first, second;

	if (start == end) {
		return 0;
	}

	first = from_le16(start);

	if (utf16_valid_surrogate_low(first)) {
		/* A sequence may not begin with a low surrogate */
		return -1;
	}

	if (!utf16_valid_surrogate_high(first)) {
		/* Not a surrogate at all: a single code unit in the BMP */
		return 1;
	}

	/* first is a high surrogate, so a low surrogate must follow */
	if (start + 1 == end) {
		/* Not enough code units left */
		return -1;
	}

	second = from_le16(start + 1);

	return utf16_valid_surrogate_low(second) ? 2 : -1;
}
294 
/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * The arguments are only checked by assert(): high must be a high surrogate
 * (0xD800..0xDBFF) and low a low surrogate (0xDC00..0xDFFF).
 *
 * \return 0x10000 plus the 20-bit value formed from the low 10 bits of high (upper
 * half) and the low 10 bits of low (lower half).
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	uint32_t upper, lower;

	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	upper = (high & 0x3FF) << 10;
	lower = low & 0x3FF;

	return (upper | lower) + 0x10000;
}
310 
/*
 * Split a codepoint outside the BMP into its UTF-16 surrogate pair.
 *
 * The codepoint range (0x10000..0x10FFFF) is only checked by assert().
 *
 * \param codepoint Codepoint to encode.
 * \param high Output: receives the high (leading) surrogate.
 * \param low Output: receives the low (trailing) surrogate.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* The pair carries the 20-bit offset above 0x10000, 10 bits per surrogate */
	offset = codepoint - 0x10000;

	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
324 
325 #endif
326