1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #ifndef SPDK_UTF_H_ 7 #define SPDK_UTF_H_ 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/endian.h" 12 #include "spdk/likely.h" 13 #include "spdk/string.h" 14 15 static inline bool 16 utf8_tail(uint8_t c) 17 { 18 /* c >= 0x80 && c <= 0xBF, or binary 01xxxxxx */ 19 return (c & 0xC0) == 0x80; 20 } 21 22 /* 23 * Check for a valid UTF-8 encoding of a single codepoint. 24 * 25 * \return Length of valid UTF-8 byte sequence, or negative if invalid. 26 */ 27 static inline int 28 utf8_valid(const uint8_t *start, const uint8_t *end) 29 { 30 const uint8_t *p = start; 31 uint8_t b0, b1, b2, b3; 32 33 if (p == end) { 34 return 0; 35 } 36 37 b0 = *p; 38 39 if (b0 <= 0x7F) { 40 return 1; 41 } 42 43 if (b0 <= 0xC1) { 44 /* Invalid start byte */ 45 return -1; 46 } 47 48 if (++p == end) { 49 /* Not enough bytes left */ 50 return -1; 51 } 52 b1 = *p; 53 54 if (b0 <= 0xDF) { 55 /* C2..DF 80..BF */ 56 if (!utf8_tail(b1)) { 57 return -1; 58 } 59 return 2; 60 } 61 62 if (++p == end) { 63 /* Not enough bytes left */ 64 return -1; 65 } 66 b2 = *p; 67 68 if (b0 == 0xE0) { 69 /* E0 A0..BF 80..BF */ 70 if (b1 < 0xA0 || b1 > 0xBF || !utf8_tail(b2)) { 71 return -1; 72 } 73 return 3; 74 } else if (b0 == 0xED && b1 >= 0xA0) { 75 /* 76 * UTF-16 surrogate pairs use U+D800..U+DFFF, which would be encoded as 77 * ED A0..BF 80..BF in UTF-8; however, surrogate pairs are not allowed in UTF-8. 78 */ 79 return -1; 80 } else if (b0 <= 0xEF) { 81 /* E1..EF 80..BF 80..BF */ 82 if (!utf8_tail(b1) || !utf8_tail(b2)) { 83 return -1; 84 } 85 return 3; 86 } 87 88 if (++p == end) { 89 /* Not enough bytes left */ 90 return -1; 91 } 92 b3 = *p; 93 94 if (b0 == 0xF0) { 95 /* F0 90..BF 80..BF 80..BF */ 96 if (b1 < 0x90 || b1 > 0xBF || !utf8_tail(b2) || !utf8_tail(b3)) { 97 return -1; 98 } 99 return 4; 100 } else if (b0 <= 0xF3) { 101 /* F1..F3 80..BF 80..BF 80..BF */ 102 if (!utf8_tail(b1) || !utf8_tail(b2) || !utf8_tail(b3)) { 103 return -1; 104 } 105 return 4; 106 } else if (b0 == 0xF4) { 107 /* F4 80..8F 80..BF 80..BF */ 108 if (b1 < 0x80 || b1 > 0x8F || !utf8_tail(b2) || !utf8_tail(b3)) { 109 return -1; 110 } 111 return 4; 112 } 113 114 return -1; 115 } 116 117 static inline uint32_t 118 utf8_decode_unsafe_1(const uint8_t *data) 119 { 120 return data[0]; 121 } 122 123 static inline uint32_t 124 utf8_decode_unsafe_2(const uint8_t *data) 125 { 126 uint32_t codepoint; 127 128 codepoint = ((data[0] & 0x1F) << 6); 129 codepoint |= (data[1] & 0x3F); 130 131 return codepoint; 132 } 133 134 static inline uint32_t 135 utf8_decode_unsafe_3(const uint8_t *data) 136 { 137 uint32_t codepoint; 138 139 codepoint = ((data[0] & 0x0F) << 12); 140 codepoint |= (data[1] & 0x3F) << 6; 141 codepoint |= (data[2] & 0x3F); 142 143 return codepoint; 144 } 145 146 static inline uint32_t 147 utf8_decode_unsafe_4(const uint8_t *data) 148 { 149 uint32_t codepoint; 150 151 codepoint = ((data[0] & 0x07) << 18); 152 codepoint |= (data[1] & 0x3F) << 12; 153 codepoint |= (data[2] & 0x3F) << 6; 154 codepoint |= (data[3] & 0x3F); 155 156 return codepoint; 157 } 158 159 /* 160 * Encode a single Unicode codepoint as UTF-8. 161 * 162 * buf must have at least 4 bytes of space available (hence unsafe). 163 * 164 * \return Number of bytes appended to buf, or negative if encoding failed. 165 */ 166 static inline int 167 utf8_encode_unsafe(uint8_t *buf, uint32_t c) 168 { 169 if (c <= 0x7F) { 170 buf[0] = c; 171 return 1; 172 } else if (c <= 0x7FF) { 173 buf[0] = 0xC0 | (c >> 6); 174 buf[1] = 0x80 | (c & 0x3F); 175 return 2; 176 } else if (c >= 0xD800 && c <= 0xDFFF) { 177 /* UTF-16 surrogate pairs - invalid in UTF-8 */ 178 return -1; 179 } else if (c <= 0xFFFF) { 180 buf[0] = 0xE0 | (c >> 12); 181 buf[1] = 0x80 | ((c >> 6) & 0x3F); 182 buf[2] = 0x80 | (c & 0x3F); 183 return 3; 184 } else if (c <= 0x10FFFF) { 185 buf[0] = 0xF0 | (c >> 18); 186 buf[1] = 0x80 | ((c >> 12) & 0x3F); 187 buf[2] = 0x80 | ((c >> 6) & 0x3F); 188 buf[3] = 0x80 | (c & 0x3F); 189 return 4; 190 } 191 return -1; 192 } 193 194 static inline int 195 utf8_codepoint_len(uint32_t c) 196 { 197 if (c <= 0x7F) { 198 return 1; 199 } else if (c <= 0x7FF) { 200 return 2; 201 } else if (c >= 0xD800 && c <= 0xDFFF) { 202 /* UTF-16 surrogate pairs - invalid in UTF-8 */ 203 return -1; 204 } else if (c <= 0xFFFF) { 205 return 3; 206 } else if (c <= 0x10FFFF) { 207 return 4; 208 } 209 return -1; 210 } 211 212 static inline bool 213 utf16_valid_surrogate_high(uint32_t val) 214 { 215 return val >= 0xD800 && val <= 0xDBFF; 216 } 217 218 static inline bool 219 utf16_valid_surrogate_low(uint32_t val) 220 { 221 return val >= 0xDC00 && val <= 0xDFFF; 222 } 223 224 /* 225 * Check for a valid UTF-16LE encoding of a single codepoint. 226 * 227 * \return Length of valid UTF-16LE sequence in 16-bit code units, or negative if invalid. 228 */ 229 static inline int 230 utf16le_valid(const uint16_t *start, const uint16_t *end) 231 { 232 const uint16_t *p = start; 233 uint16_t high, low; 234 235 if (p == end) { 236 return 0; 237 } 238 239 high = from_le16(p); 240 241 if (high <= 0xD7FF || high >= 0xE000) { 242 /* Single code unit in BMP */ 243 return 1; 244 } 245 246 if (high >= 0xDC00) { 247 /* Low surrogate in first code unit - invalid */ 248 return -1; 249 } 250 251 assert(utf16_valid_surrogate_high(high)); 252 253 if (++p == end) { 254 /* Not enough code units left */ 255 return -1; 256 } 257 low = from_le16(p); 258 259 if (!utf16_valid_surrogate_low(low)) { 260 return -1; 261 } 262 263 /* Valid surrogate pair */ 264 return 2; 265 } 266 267 static inline uint32_t 268 utf16_decode_surrogate_pair(uint32_t high, uint32_t low) 269 { 270 uint32_t codepoint; 271 272 assert(utf16_valid_surrogate_high(high)); 273 assert(utf16_valid_surrogate_low(low)); 274 275 codepoint = low; 276 codepoint &= 0x3FF; 277 codepoint |= ((high & 0x3FF) << 10); 278 codepoint += 0x10000; 279 280 return codepoint; 281 } 282 283 static inline void 284 utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low) 285 { 286 assert(codepoint >= 0x10000); 287 assert(codepoint <= 0x10FFFF); 288 289 codepoint -= 0x10000; 290 *high = 0xD800 | (codepoint >> 10); 291 *low = 0xDC00 | (codepoint & 0x3FF); 292 293 assert(utf16_valid_surrogate_high(*high)); 294 assert(utf16_valid_surrogate_low(*low)); 295 } 296 297 #endif 298