1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (C) 2016 Intel Corporation.
3 * All rights reserved.
4 */
5
6 #ifndef SPDK_UTF_H_
7 #define SPDK_UTF_H_
8
9 #include "spdk/stdinc.h"
10
11 #include "spdk/endian.h"
12 #include "spdk/likely.h"
13 #include "spdk/string.h"
14
/*
 * Check whether a byte is a UTF-8 continuation (tail) byte.
 *
 * \param c Byte to test.
 *
 * \return true if c lies in 0x80..0xBF (binary 10xxxxxx), false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	/* Tail bytes have the top two bits set to 1 and 0, i.e. the range 0x80..0xBF. */
	return c >= 0x80 && c <= 0xBF;
}
21
/*
 * Validate the UTF-8 encoding of the single codepoint that begins at start.
 *
 * Bytes are only read from the half-open range [start, end); a sequence that
 * would run past end is treated as invalid.
 *
 * \param start Pointer to the first byte of the candidate sequence.
 * \param end Pointer one past the last readable byte.
 *
 * \return Length in bytes (1..4) of the valid sequence, 0 if start == end,
 * or negative if the bytes do not form a valid sequence.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	const uint8_t *cur = start;
	uint8_t lead, second, third, fourth;

	/* Empty range: nothing to validate. */
	if (cur == end) {
		return 0;
	}

	lead = *cur++;

	/* 00..7F: single-byte sequence */
	if (lead < 0x80) {
		return 1;
	}

	/* 80..C1 can never begin a valid sequence */
	if (lead < 0xC2) {
		return -1;
	}

	/* Every remaining form needs at least a second byte. */
	if (cur == end) {
		return -1;
	}
	second = *cur++;

	/* C2..DF 80..BF */
	if (lead < 0xE0) {
		return utf8_tail(second) ? 2 : -1;
	}

	/* Three- and four-byte forms need a third byte. */
	if (cur == end) {
		return -1;
	}
	third = *cur++;

	if (lead < 0xF0) {
		if (lead == 0xE0) {
			/* E0 A0..BF 80..BF */
			if (second < 0xA0 || second > 0xBF) {
				return -1;
			}
		} else if (lead == 0xED && second >= 0xA0) {
			/*
			 * ED A0..BF 80..BF would encode U+D800..U+DFFF, the UTF-16
			 * surrogate range, which is not allowed in UTF-8.
			 */
			return -1;
		} else if (!utf8_tail(second)) {
			/* E1..EF 80..BF 80..BF */
			return -1;
		}

		return utf8_tail(third) ? 3 : -1;
	}

	/* Only four-byte forms remain; they need a fourth byte. */
	if (cur == end) {
		return -1;
	}
	fourth = *cur;

	if (lead == 0xF0) {
		/* F0 90..BF 80..BF 80..BF */
		if (second < 0x90 || second > 0xBF) {
			return -1;
		}
	} else if (lead <= 0xF3) {
		/* F1..F3 80..BF 80..BF 80..BF */
		if (!utf8_tail(second)) {
			return -1;
		}
	} else if (lead == 0xF4) {
		/* F4 80..8F 80..BF 80..BF */
		if (second < 0x80 || second > 0x8F) {
			return -1;
		}
	} else {
		/* F5..FF are not valid start bytes */
		return -1;
	}

	return (utf8_tail(third) && utf8_tail(fourth)) ? 4 : -1;
}
116
/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data[0] must be readable.
 *
 * \return The value of data[0] as a codepoint.
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t)*data;
}
122
/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data[0..1] must be readable.
 *
 * \return Codepoint built from the low 5 bits of data[0] and the low 6 bits of data[1].
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x1F;
	const uint32_t tail_bits = data[1] & 0x3F;

	return (lead_bits << 6) | tail_bits;
}
133
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data[0..2] must be readable.
 *
 * \return Codepoint built from the low 4 bits of data[0] and the low 6 bits
 * of each of data[1] and data[2].
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x0F;
	const uint32_t mid_bits = data[1] & 0x3F;
	const uint32_t low_bits = data[2] & 0x3F;

	return (lead_bits << 12) | (mid_bits << 6) | low_bits;
}
145
/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data[0..3] must be readable.
 *
 * \return Codepoint built from the low 3 bits of data[0] and the low 6 bits
 * of each of data[1], data[2] and data[3].
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x07;
	const uint32_t hi_bits = data[1] & 0x3F;
	const uint32_t mid_bits = data[2] & 0x3F;
	const uint32_t low_bits = data[3] & 0x3F;

	return (lead_bits << 18) | (hi_bits << 12) | (mid_bits << 6) | low_bits;
}
158
/*
 * Write the UTF-8 encoding of one Unicode codepoint into buf.
 *
 * The caller must guarantee that buf has room for at least 4 bytes; no bounds
 * checking is done here (hence unsafe). Nothing is written when encoding fails.
 *
 * \param buf Destination buffer.
 * \param c Codepoint to encode.
 *
 * \return Number of bytes written to buf (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	if (c < 0x80) {
		buf[0] = (uint8_t)c;
		return 1;
	}

	if (c < 0x800) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	if (c >= 0xD800 && c <= 0xDFFF) {
		/* UTF-16 surrogates are not encodable in UTF-8 */
		return -1;
	}

	if (c < 0x10000) {
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	if (c > 0x10FFFF) {
		/* Beyond the last Unicode codepoint */
		return -1;
	}

	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}
193
/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \param c Codepoint to measure.
 *
 * \return Encoded length in bytes (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	/* Reject values that have no UTF-8 encoding before sizing the rest. */
	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
		return -1;
	}

	if (c < 0x80) {
		return 1;
	}

	if (c < 0x800) {
		return 2;
	}

	return c < 0x10000 ? 3 : 4;
}
211
/*
 * Check whether a value is a UTF-16 high surrogate.
 *
 * \param val Code unit value to test.
 *
 * \return true if val lies in 0xD800..0xDBFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/* Ignoring the low 10 bits, the range 0xD800..0xDBFF collapses to exactly 0xD800. */
	return (val & ~(uint32_t)0x3FF) == 0xD800;
}
217
/*
 * Check whether a value is a UTF-16 low surrogate.
 *
 * \param val Code unit value to test.
 *
 * \return true if val lies in 0xDC00..0xDFFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/* Ignoring the low 10 bits, the range 0xDC00..0xDFFF collapses to exactly 0xDC00. */
	return (val & ~(uint32_t)0x3FF) == 0xDC00;
}
223
/*
 * Validate the UTF-16LE encoding of the single codepoint that begins at start.
 *
 * Code units are only read from the half-open range [start, end), and each
 * one is fetched through from_le16().
 *
 * \param start Pointer to the first 16-bit code unit of the candidate sequence.
 * \param end Pointer one past the last readable code unit.
 *
 * \return Length of the valid sequence in 16-bit code units (1 or 2),
 * 0 if start == end, or negative if the sequence is invalid.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	const uint16_t *cur = start;
	uint16_t high, low;

	/* Empty range: nothing to validate. */
	if (cur == end) {
		return 0;
	}

	high = from_le16(cur);

	/* Anything outside 0xD800..0xDFFF is a complete codepoint on its own. */
	if (high < 0xD800 || high > 0xDFFF) {
		return 1;
	}

	/* A sequence must not begin with a low surrogate (0xDC00..0xDFFF). */
	if (high > 0xDBFF) {
		return -1;
	}

	assert(utf16_valid_surrogate_high(high));

	/* A high surrogate must be followed by another code unit... */
	cur++;
	if (cur == end) {
		return -1;
	}
	low = from_le16(cur);

	/* ...and that code unit has to be a low surrogate. */
	return utf16_valid_surrogate_low(low) ? 2 : -1;
}
266
/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * The arguments are only checked with assert(); callers are expected to pass
 * a valid high surrogate and a valid low surrogate.
 *
 * \param high High surrogate (0xD800..0xDBFF).
 * \param low Low surrogate (0xDC00..0xDFFF).
 *
 * \return Decoded codepoint: 0x10000 plus the 20-bit value formed by the low
 * 10 bits of high (upper half) and the low 10 bits of low (lower half).
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	return (((high & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
}
282
/*
 * Split a supplementary-plane codepoint into a UTF-16 surrogate pair.
 *
 * The codepoint range is only checked with assert(); callers are expected to
 * pass a value in 0x10000..0x10FFFF.
 *
 * \param codepoint Codepoint to encode.
 * \param high Output: receives the high surrogate.
 * \param low Output: receives the low surrogate.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* Surrogates carry the 20-bit offset above 0x10000, 10 bits apiece. */
	offset = codepoint - 0x10000;
	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
296
297 #endif
298