include/spdk_internal/utf.h

/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef SPDK_UTF_H_
#define SPDK_UTF_H_

#include "spdk/stdinc.h"

#include "spdk/endian.h"
#include "spdk/likely.h"
#include "spdk/string.h"

/*
 * Test whether a byte is a UTF-8 continuation ("tail") byte.
 *
 * Continuation bytes have the bit pattern 10xxxxxx, i.e. the range 0x80..0xBF.
 *
 * \param c Byte to classify.
 *
 * \return true if c is in 0x80..0xBF, false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	return c >= 0x80 && c <= 0xBF;
}

/*
 * Check for a valid UTF-8 encoding of a single codepoint.
 *
 * Only bytes in the half-open range [start, end) are read. Overlong encodings,
 * encoded UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are
 * rejected, as are sequences that are cut short by end.
 *
 * \param start First byte of the candidate sequence.
 * \param end One past the last byte that may be read.
 *
 * \return Length of valid UTF-8 byte sequence (1 to 4), 0 if start == end,
 * or negative if invalid.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	const uint8_t *cur = start;
	uint8_t lead, t1, t2, t3;
	uint8_t t1_min, t1_max;

	if (cur == end) {
		return 0;
	}

	lead = *cur;

	if (lead < 0x80) {
		/* 00..7F: single-byte (ASCII) */
		return 1;
	}

	if (lead < 0xC2) {
		/* 80..BF are tail bytes; C0 and C1 could only start overlong forms */
		return -1;
	}

	cur++;
	if (cur == end) {
		/* Truncated multi-byte sequence */
		return -1;
	}
	t1 = *cur;

	if (lead < 0xE0) {
		/* C2..DF 80..BF */
		return utf8_tail(t1) ? 2 : -1;
	}

	cur++;
	if (cur == end) {
		/* Truncated multi-byte sequence */
		return -1;
	}
	t2 = *cur;

	if (lead < 0xF0) {
		/*
		 * E0      A0..BF 80..BF  (lower bound excludes overlong forms)
		 * ED      80..9F 80..BF  (upper bound excludes U+D800..U+DFFF surrogates)
		 * others  80..BF 80..BF
		 */
		t1_min = (lead == 0xE0) ? 0xA0 : 0x80;
		t1_max = (lead == 0xED) ? 0x9F : 0xBF;

		if (t1 < t1_min || t1 > t1_max || !utf8_tail(t2)) {
			return -1;
		}
		return 3;
	}

	cur++;
	if (cur == end) {
		/* Truncated multi-byte sequence */
		return -1;
	}
	t3 = *cur;

	if (lead > 0xF4) {
		/* F5..FF would encode values above U+10FFFF */
		return -1;
	}

	/*
	 * F0      90..BF 80..BF 80..BF  (lower bound excludes overlong forms)
	 * F1..F3  80..BF 80..BF 80..BF
	 * F4      80..8F 80..BF 80..BF  (upper bound caps at U+10FFFF)
	 */
	t1_min = (lead == 0xF0) ? 0x90 : 0x80;
	t1_max = (lead == 0xF4) ? 0x8F : 0xBF;

	if (t1 < t1_min || t1 > t1_max || !utf8_tail(t2) || !utf8_tail(t3)) {
		return -1;
	}
	return 4;
}

/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data[0] is returned as-is.
 *
 * \param data Pointer to at least 1 readable byte.
 *
 * \return The value of data[0].
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t) * data;
}

/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe): the low 5 bits of data[0] and the
 * low 6 bits of data[1] are simply concatenated.
 *
 * \param data Pointer to at least 2 readable bytes.
 *
 * \return The 11-bit value assembled from the two bytes.
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	return ((uint32_t)(data[0] & 0x1F) << 6) | (uint32_t)(data[1] & 0x3F);
}

/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe): the low 4 bits of data[0] and the
 * low 6 bits of data[1] and data[2] are simply concatenated.
 *
 * \param data Pointer to at least 3 readable bytes.
 *
 * \return The 16-bit value assembled from the three bytes.
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	const uint32_t top = data[0] & 0x0F;
	const uint32_t mid = data[1] & 0x3F;
	const uint32_t low = data[2] & 0x3F;

	return (top << 12) | (mid << 6) | low;
}

/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe): the low 3 bits of data[0] and the
 * low 6 bits of data[1], data[2] and data[3] are simply concatenated.
 *
 * \param data Pointer to at least 4 readable bytes.
 *
 * \return The 21-bit value assembled from the four bytes.
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	const uint32_t top = data[0] & 0x07;
	const uint32_t upper = data[1] & 0x3F;
	const uint32_t lower = data[2] & 0x3F;
	const uint32_t low = data[3] & 0x3F;

	return (top << 18) | (upper << 12) | (lower << 6) | low;
}

/*
 * Encode a single Unicode codepoint as UTF-8.
 *
 * buf must have at least 4 bytes of space available (hence unsafe); no bounds
 * checking is done. Nothing is written to buf when encoding fails.
 *
 * \param buf Destination for the encoded bytes.
 * \param c Codepoint to encode.
 *
 * \return Number of bytes appended to buf (1 to 4), or negative if encoding
 * failed (c is a UTF-16 surrogate or is greater than 0x10FFFF).
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	if (c < 0x80) {
		buf[0] = (uint8_t)c;
		return 1;
	}

	if (c < 0x800) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	if (c < 0x10000) {
		if (c >= 0xD800 && c <= 0xDFFF) {
			/* UTF-16 surrogate pairs - invalid in UTF-8 */
			return -1;
		}

		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	if (c > 0x10FFFF) {
		/* Beyond the last codepoint this encoder accepts */
		return -1;
	}

	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}

/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \param c Codepoint to measure.
 *
 * \return Encoded length in bytes (1 to 4), or negative if c is a UTF-16
 * surrogate (0xD800..0xDFFF) or is greater than 0x10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	if (c > 0x10FFFF) {
		return -1;
	}

	if (c >= 0x10000) {
		return 4;
	}

	if (c >= 0xD800 && c <= 0xDFFF) {
		/* UTF-16 surrogate pairs - invalid in UTF-8 */
		return -1;
	}

	if (c >= 0x800) {
		return 3;
	}

	return (c >= 0x80) ? 2 : 1;
}

/*
 * Test whether a value is a UTF-16 high (leading) surrogate.
 *
 * \param val Value to classify.
 *
 * \return true if val is in 0xD800..0xDBFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/* Everything above the low 10 bits must match the 0xD800 prefix. */
	return (val & 0xFFFFFC00u) == 0xD800u;
}

/*
 * Test whether a value is a UTF-16 low (trailing) surrogate.
 *
 * \param val Value to classify.
 *
 * \return true if val is in 0xDC00..0xDFFF, false otherwise.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/* Everything above the low 10 bits must match the 0xDC00 prefix. */
	return (val & 0xFFFFFC00u) == 0xDC00u;
}

/*
 * Check for a valid UTF-16LE encoding of a single codepoint.
 *
 * Only code units in the half-open range [start, end) are read, each one
 * through from_le16().
 *
 * \param start First 16-bit code unit of the candidate sequence.
 * \param end One past the last code unit that may be read.
 *
 * \return Length of valid UTF-16LE sequence in 16-bit code units (1 or 2),
 * 0 if start == end, or negative if invalid.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	uint16_t first, second;

	if (start == end) {
		return 0;
	}

	first = from_le16(start);

	if (utf16_valid_surrogate_low(first)) {
		/* A sequence may not begin with a low surrogate */
		return -1;
	}

	if (!utf16_valid_surrogate_high(first)) {
		/* Not a surrogate at all: single code unit in BMP */
		return 1;
	}

	/* first is a high surrogate, so a low surrogate must follow */
	if (start + 1 == end) {
		/* Not enough code units left */
		return -1;
	}

	second = from_le16(start + 1);

	return utf16_valid_surrogate_low(second) ? 2 : -1;
}

/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * Both inputs are checked with assert() only; no other validation is done.
 *
 * \param high High (leading) surrogate, expected in 0xD800..0xDBFF.
 * \param low Low (trailing) surrogate, expected in 0xDC00..0xDFFF.
 *
 * \return 0x10000 plus the 20-bit value formed from the low 10 bits of high
 * (upper half) and the low 10 bits of low (lower half).
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	uint32_t upper_bits, lower_bits;

	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	upper_bits = (high & 0x3FF) << 10;
	lower_bits = low & 0x3FF;

	return (upper_bits | lower_bits) + 0x10000;
}

/*
 * Split a codepoint into its UTF-16 surrogate pair.
 *
 * The codepoint range and the resulting surrogates are checked with assert()
 * only; no other validation is done.
 *
 * \param codepoint Codepoint to encode, expected in 0x10000..0x10FFFF.
 * \param high Output for the high (leading) surrogate.
 * \param low Output for the low (trailing) surrogate.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* Surrogates carry the 20-bit offset from 0x10000, 10 bits in each half. */
	offset = codepoint - 0x10000;

	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}

#endif