1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifndef SPDK_UTF_H_ 35 #define SPDK_UTF_H_ 36 37 #include "spdk/stdinc.h" 38 39 #include "spdk/endian.h" 40 #include "spdk/likely.h" 41 #include "spdk/string.h" 42 43 static inline bool 44 utf8_tail(uint8_t c) 45 { 46 /* c >= 0x80 && c <= 0xBF, or binary 01xxxxxx */ 47 return (c & 0xC0) == 0x80; 48 } 49 50 /* 51 * Check for a valid UTF-8 encoding of a single codepoint. 52 * 53 * \return Length of valid UTF-8 byte sequence, or negative if invalid. 54 */ 55 static inline int 56 utf8_valid(const uint8_t *start, const uint8_t *end) 57 { 58 const uint8_t *p = start; 59 uint8_t b0, b1, b2, b3; 60 61 if (p == end) { 62 return 0; 63 } 64 65 b0 = *p; 66 67 if (b0 <= 0x7F) { 68 return 1; 69 } 70 71 if (b0 <= 0xC1) { 72 /* Invalid start byte */ 73 return -1; 74 } 75 76 if (++p == end) { 77 /* Not enough bytes left */ 78 return -1; 79 } 80 b1 = *p; 81 82 if (b0 <= 0xDF) { 83 /* C2..DF 80..BF */ 84 if (!utf8_tail(b1)) { 85 return -1; 86 } 87 return 2; 88 } 89 90 if (++p == end) { 91 /* Not enough bytes left */ 92 return -1; 93 } 94 b2 = *p; 95 96 if (b0 == 0xE0) { 97 /* E0 A0..BF 80..BF */ 98 if (b1 < 0xA0 || b1 > 0xBF || !utf8_tail(b2)) { 99 return -1; 100 } 101 return 3; 102 } else if (b0 == 0xED && b1 >= 0xA0) { 103 /* 104 * UTF-16 surrogate pairs use U+D800..U+DFFF, which would be encoded as 105 * ED A0..BF 80..BF in UTF-8; however, surrogate pairs are not allowed in UTF-8. 106 */ 107 return -1; 108 } else if (b0 <= 0xEF) { 109 /* E1..EF 80..BF 80..BF */ 110 if (!utf8_tail(b1) || !utf8_tail(b2)) { 111 return -1; 112 } 113 return 3; 114 } 115 116 if (++p == end) { 117 /* Not enough bytes left */ 118 return -1; 119 } 120 b3 = *p; 121 122 if (b0 == 0xF0) { 123 /* F0 90..BF 80..BF 80..BF */ 124 if (b1 < 0x90 || b1 > 0xBF || !utf8_tail(b2) || !utf8_tail(b3)) { 125 return -1; 126 } 127 return 4; 128 } else if (b0 <= 0xF3) { 129 /* F1..F3 80..BF 80..BF 80..BF */ 130 if (!utf8_tail(b1) || !utf8_tail(b2) || !utf8_tail(b3)) { 131 return -1; 132 } 133 return 4; 134 } else if (b0 == 0xF4) { 135 /* F4 80..8F 80..BF 80..BF */ 136 if (b1 < 0x80 || b1 > 0x8F || !utf8_tail(b2) || !utf8_tail(b3)) { 137 return -1; 138 } 139 return 4; 140 } 141 142 return -1; 143 } 144 145 static inline uint32_t 146 utf8_decode_unsafe_1(const uint8_t *data) 147 { 148 return data[0]; 149 } 150 151 static inline uint32_t 152 utf8_decode_unsafe_2(const uint8_t *data) 153 { 154 uint32_t codepoint; 155 156 codepoint = ((data[0] & 0x1F) << 6); 157 codepoint |= (data[1] & 0x3F); 158 159 return codepoint; 160 } 161 162 static inline uint32_t 163 utf8_decode_unsafe_3(const uint8_t *data) 164 { 165 uint32_t codepoint; 166 167 codepoint = ((data[0] & 0x0F) << 12); 168 codepoint |= (data[1] & 0x3F) << 6; 169 codepoint |= (data[2] & 0x3F); 170 171 return codepoint; 172 } 173 174 static inline uint32_t 175 utf8_decode_unsafe_4(const uint8_t *data) 176 { 177 uint32_t codepoint; 178 179 codepoint = ((data[0] & 0x07) << 18); 180 codepoint |= (data[1] & 0x3F) << 12; 181 codepoint |= (data[2] & 0x3F) << 6; 182 codepoint |= (data[3] & 0x3F); 183 184 return codepoint; 185 } 186 187 /* 188 * Encode a single Unicode codepoint as UTF-8. 189 * 190 * buf must have at least 4 bytes of space available (hence unsafe). 191 * 192 * \return Number of bytes appended to buf, or negative if encoding failed. 193 */ 194 static inline int 195 utf8_encode_unsafe(uint8_t *buf, uint32_t c) 196 { 197 if (c <= 0x7F) { 198 buf[0] = c; 199 return 1; 200 } else if (c <= 0x7FF) { 201 buf[0] = 0xC0 | (c >> 6); 202 buf[1] = 0x80 | (c & 0x3F); 203 return 2; 204 } else if (c >= 0xD800 && c <= 0xDFFF) { 205 /* UTF-16 surrogate pairs - invalid in UTF-8 */ 206 return -1; 207 } else if (c <= 0xFFFF) { 208 buf[0] = 0xE0 | (c >> 12); 209 buf[1] = 0x80 | ((c >> 6) & 0x3F); 210 buf[2] = 0x80 | (c & 0x3F); 211 return 3; 212 } else if (c <= 0x10FFFF) { 213 buf[0] = 0xF0 | (c >> 18); 214 buf[1] = 0x80 | ((c >> 12) & 0x3F); 215 buf[2] = 0x80 | ((c >> 6) & 0x3F); 216 buf[3] = 0x80 | (c & 0x3F); 217 return 4; 218 } 219 return -1; 220 } 221 222 static inline int 223 utf8_codepoint_len(uint32_t c) 224 { 225 if (c <= 0x7F) { 226 return 1; 227 } else if (c <= 0x7FF) { 228 return 2; 229 } else if (c >= 0xD800 && c <= 0xDFFF) { 230 /* UTF-16 surrogate pairs - invalid in UTF-8 */ 231 return -1; 232 } else if (c <= 0xFFFF) { 233 return 3; 234 } else if (c <= 0x10FFFF) { 235 return 4; 236 } 237 return -1; 238 } 239 240 static inline bool 241 utf16_valid_surrogate_high(uint32_t val) 242 { 243 return val >= 0xD800 && val <= 0xDBFF; 244 } 245 246 static inline bool 247 utf16_valid_surrogate_low(uint32_t val) 248 { 249 return val >= 0xDC00 && val <= 0xDFFF; 250 } 251 252 /* 253 * Check for a valid UTF-16LE encoding of a single codepoint. 254 * 255 * \return Length of valid UTF-16LE sequence in 16-bit code units, or negative if invalid. 256 */ 257 static inline int 258 utf16le_valid(const uint16_t *start, const uint16_t *end) 259 { 260 const uint16_t *p = start; 261 uint16_t high, low; 262 263 if (p == end) { 264 return 0; 265 } 266 267 high = from_le16(p); 268 269 if (high <= 0xD7FF || high >= 0xE000) { 270 /* Single code unit in BMP */ 271 return 1; 272 } 273 274 if (high >= 0xDC00) { 275 /* Low surrogate in first code unit - invalid */ 276 return -1; 277 } 278 279 assert(utf16_valid_surrogate_high(high)); 280 281 if (++p == end) { 282 /* Not enough code units left */ 283 return -1; 284 } 285 low = from_le16(p); 286 287 if (!utf16_valid_surrogate_low(low)) { 288 return -1; 289 } 290 291 /* Valid surrogate pair */ 292 return 2; 293 } 294 295 static inline uint32_t 296 utf16_decode_surrogate_pair(uint32_t high, uint32_t low) 297 { 298 uint32_t codepoint; 299 300 assert(utf16_valid_surrogate_high(high)); 301 assert(utf16_valid_surrogate_low(low)); 302 303 codepoint = low; 304 codepoint &= 0x3FF; 305 codepoint |= ((high & 0x3FF) << 10); 306 codepoint += 0x10000; 307 308 return codepoint; 309 } 310 311 static inline void 312 utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low) 313 { 314 assert(codepoint >= 0x10000); 315 assert(codepoint <= 0x10FFFF); 316 317 codepoint -= 0x10000; 318 *high = 0xD800 | (codepoint >> 10); 319 *low = 0xDC00 | (codepoint & 0x3FF); 320 321 assert(utf16_valid_surrogate_high(*high)); 322 assert(utf16_valid_surrogate_low(*low)); 323 } 324 325 #endif 326