1*4afad4b7Schristos /* $NetBSD: utf8.c,v 1.1 2024/02/18 20:57:51 christos Exp $ */
2*4afad4b7Schristos
3*4afad4b7Schristos /*
4*4afad4b7Schristos * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5*4afad4b7Schristos *
6*4afad4b7Schristos * SPDX-License-Identifier: MPL-2.0
7*4afad4b7Schristos *
8*4afad4b7Schristos * This Source Code Form is subject to the terms of the Mozilla Public
9*4afad4b7Schristos * License, v. 2.0. If a copy of the MPL was not distributed with this
10*4afad4b7Schristos * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11*4afad4b7Schristos *
12*4afad4b7Schristos * See the COPYRIGHT file distributed with this work for additional
13*4afad4b7Schristos * information regarding copyright ownership.
14*4afad4b7Schristos */
15*4afad4b7Schristos
16*4afad4b7Schristos #include <string.h>
17*4afad4b7Schristos
18*4afad4b7Schristos #include <isc/utf8.h>
19*4afad4b7Schristos #include <isc/util.h>
20*4afad4b7Schristos
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
/*
 * Return true if the 'len' octets starting at 'buf' are a well-formed
 * UTF-8 byte sequence, false otherwise.
 *
 * The input is rejected when it contains:
 *	- a lead octet that is not followed by the required number of
 *	  continuation octets (10xxxxxx), including a sequence cut short
 *	  by the end of the buffer;
 *	- a continuation octet or any other octet that cannot start a
 *	  sequence;
 *	- an overlong encoding (a code point encoded in more octets than
 *	  necessary);
 *	- a UTF-16 surrogate code point (U+D800..U+DFFF), which RFC 3629
 *	  prohibits in UTF-8;
 *	- a code point above U+10FFFF.
 *
 * An empty buffer (len == 0) is valid.
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE, even when 'len' is 0).
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* One octet: plain ASCII. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: 110xxxxx 10xxxxxx */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x80) {
				/* Overlong: fits in one octet. */
				return (false);
			}
			continue;
		}

		/* Three octets: 1110xxxx 10xxxxxx 10xxxxxx */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x0800) {
				/* Overlong: fits in two octets. */
				return (false);
			}
			if (w >= 0xd800 && w <= 0xdfff) {
				/*
				 * UTF-16 surrogates are not valid Unicode
				 * scalar values and must not appear in
				 * UTF-8 (RFC 3629, section 3).
				 */
				return (false);
			}
			continue;
		}

		/* Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x10000 || w > 0x10FFFF) {
				/* Overlong, or beyond the Unicode range. */
				return (false);
			}
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, bad
		 * continuation octet, or sequence truncated by 'len'.
		 */
		return (false);
	}
	return (true);
}
82*4afad4b7Schristos
/*
 * Return true if 'buf' (of 'len' octets) begins with the three-octet
 * UTF-8 encoding of the byte order mark (EF BB BF), false otherwise.
 * Buffers shorter than three octets never match.
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE).
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	/* Too short to hold the mark: do not read past the buffer. */
	if (len < sizeof(bom)) {
		return (false);
	}

	return (memcmp(buf, bom, sizeof(bom)) == 0);
}
92