xref: /freebsd-src/contrib/sendmail/libsm/utf8_valid.c (revision 2fb4f839f3fc72ce2bab12f9ba4760f97f73e97f)
1*2fb4f839SGregory Neil Shapiro /*
2*2fb4f839SGregory Neil Shapiro  * Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
3*2fb4f839SGregory Neil Shapiro  *	All rights reserved.
4*2fb4f839SGregory Neil Shapiro  *
5*2fb4f839SGregory Neil Shapiro  * By using this file, you agree to the terms and conditions set
6*2fb4f839SGregory Neil Shapiro  * forth in the LICENSE file which can be found at the top level of
7*2fb4f839SGregory Neil Shapiro  * the sendmail distribution.
8*2fb4f839SGregory Neil Shapiro  *
9*2fb4f839SGregory Neil Shapiro  */
10*2fb4f839SGregory Neil Shapiro 
11*2fb4f839SGregory Neil Shapiro #include <sm/gen.h>
12*2fb4f839SGregory Neil Shapiro #include <sm/sendmail.h>
13*2fb4f839SGregory Neil Shapiro #include <sm/ixlen.h>
14*2fb4f839SGregory Neil Shapiro 
15*2fb4f839SGregory Neil Shapiro #if USE_EAI
16*2fb4f839SGregory Neil Shapiro 
17*2fb4f839SGregory Neil Shapiro /*
18*2fb4f839SGregory Neil Shapiro **  legal utf-8 byte sequence
19*2fb4f839SGregory Neil Shapiro **  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
20*2fb4f839SGregory Neil Shapiro **
21*2fb4f839SGregory Neil Shapiro **   Code Points        1st       2s       3s       4s
22*2fb4f839SGregory Neil Shapiro **  U+0000..U+007F     00..7F
23*2fb4f839SGregory Neil Shapiro **  U+0080..U+07FF     C2..DF   80..BF
24*2fb4f839SGregory Neil Shapiro **  U+0800..U+0FFF     E0       A0..BF   80..BF
25*2fb4f839SGregory Neil Shapiro **  U+1000..U+CFFF     E1..EC   80..BF   80..BF
26*2fb4f839SGregory Neil Shapiro **  U+D000..U+D7FF     ED       80..9F   80..BF
27*2fb4f839SGregory Neil Shapiro **  U+E000..U+FFFF     EE..EF   80..BF   80..BF
28*2fb4f839SGregory Neil Shapiro **  U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
29*2fb4f839SGregory Neil Shapiro **  U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
30*2fb4f839SGregory Neil Shapiro **  U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
31*2fb4f839SGregory Neil Shapiro */
32*2fb4f839SGregory Neil Shapiro 
33*2fb4f839SGregory Neil Shapiro /*
34*2fb4f839SGregory Neil Shapiro **  based on
35*2fb4f839SGregory Neil Shapiro **  https://github.com/lemire/fastvalidate-utf-8.git
36*2fb4f839SGregory Neil Shapiro **  which is distributed under an MIT license (besides others).
37*2fb4f839SGregory Neil Shapiro */
38*2fb4f839SGregory Neil Shapiro 
/*
**  UTF8_VALID -- check whether a byte sequence is well-formed UTF-8
**
**	Every sequence is checked against the table of legal UTF-8 byte
**	sequences: overlong encodings, surrogate code points
**	(U+D800..U+DFFF), code points above U+10FFFF, stray or missing
**	continuation bytes, and sequences truncated by the end of the
**	buffer are all rejected.
**
**	Parameters:
**		b -- bytes to check; only length bytes are examined,
**			a NUL byte is treated like any other ASCII byte.
**		length -- number of bytes in b.
**
**	Returns:
**		true iff b[0..length-1] is well-formed UTF-8
**		(an empty sequence is well-formed).
*/

bool
utf8_valid(b, length)
	const char *b;
	size_t length;
{
	const unsigned char *bytes;
	size_t index;

	bytes = (const unsigned char *)b;
	index = 0;
	while (true)
	{
		unsigned char byte1, byte2, byte3, byte4;

		do { /* fast ASCII path */
			if (index >= length)
				return true;
			byte1 = bytes[index++];
		} while (byte1 < 0x80);

		/* here: index <= length, so length - index cannot wrap */
		if (byte1 < 0xE0)
		{
			/*
			**  Two-byte form.
			**  80..BF as a lead byte is a stray continuation byte,
			**  C0..C1 would be an overlong encoding of ASCII.
			*/

			if (byte1 < 0xC2 || length - index < 1)
				return false;
			byte2 = bytes[index++];

			/*
			**  The bytes are unsigned, so both ends of the
			**  continuation range 80..BF must be tested.
			*/

			if (byte2 < 0x80 || byte2 > 0xBF)
				return false;
		}
		else if (byte1 < 0xF0)
		{
			/* Three-byte form. */
			if (length - index < 2)
				return false;
			byte2 = bytes[index++];
			byte3 = bytes[index++];
			if (byte2 < 0x80 || byte2 > 0xBF
			    /* Overlong? E0 requires A0..BF. */
			    || (byte1 == 0xE0 && byte2 < 0xA0)
			    /* Surrogate code points? ED requires 80..9F. */
			    || (byte1 == 0xED && byte2 > 0x9F)
			    /* Third byte trailing-byte test. */
			    || byte3 < 0x80 || byte3 > 0xBF)
				return false;
		}
		else
		{
			/*
			**  Four-byte form.
			**  Lead bytes above F4 would encode code points
			**  beyond U+10FFFF.
			*/

			if (byte1 > 0xF4 || length - index < 3)
				return false;
			byte2 = bytes[index++];
			byte3 = bytes[index++];
			byte4 = bytes[index++];
			if (byte2 < 0x80 || byte2 > 0xBF
			    /* Overlong? F0 requires 90..BF. */
			    || (byte1 == 0xF0 && byte2 < 0x90)
			    /* Above U+10FFFF? F4 requires 80..8F. */
			    || (byte1 == 0xF4 && byte2 > 0x8F)
			    /* Third byte trailing-byte test. */
			    || byte3 < 0x80 || byte3 > 0xBF
			    /* Fourth byte trailing-byte test. */
			    || byte4 < 0x80 || byte4 > 0xBF)
				return false;
		}
	}
	/* NOTREACHED */
	return false;
}
104*2fb4f839SGregory Neil Shapiro #endif /* USE_EAI */
105