/*
 * Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
 *	All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set
 * forth in the LICENSE file which can be found at the top level of
 * the sendmail distribution.
 *
 */

#include <sm/gen.h>
#include <sm/sendmail.h>
#include <sm/ixlen.h>

#if USE_EAI

/*
**  legal utf-8 byte sequence
**  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
**
**   Code Points         1st      2nd      3rd      4th
**  U+0000..U+007F      00..7F
**  U+0080..U+07FF      C2..DF   80..BF
**  U+0800..U+0FFF      E0       A0..BF   80..BF
**  U+1000..U+CFFF      E1..EC   80..BF   80..BF
**  U+D000..U+D7FF      ED       80..9F   80..BF
**  U+E000..U+FFFF      EE..EF   80..BF   80..BF
**  U+10000..U+3FFFF    F0       90..BF   80..BF   80..BF
**  U+40000..U+FFFFF    F1..F3   80..BF   80..BF   80..BF
**  U+100000..U+10FFFF  F4       80..8F   80..BF   80..BF
*/

/*
**  based on
**
/*
**  The validation loop is based on
**  https://github.com/lemire/fastvalidate-utf-8.git
**  which is distributed under an MIT license (besides others).
*/

/*
**  UTF8_VALID -- check whether a byte string is well-formed UTF-8
**
**	Parameters:
**		b -- bytes to check; it does not have to be NUL-terminated
**			and a NUL byte is treated like any other ASCII byte.
**		length -- number of bytes of b to examine.
**
**	Returns:
**		true iff the first length bytes of b consist only of
**		complete, shortest-form UTF-8 sequences encoding
**		U+0000..U+10FFFF without the surrogates U+D800..U+DFFF.
**		An empty input (length == 0) is valid.
*/

bool
utf8_valid(const char *b, size_t length)
{
	const unsigned char *bytes;
	size_t index;

	bytes = (const unsigned char *) b;
	index = 0;
	while (index < length)
	{
		unsigned char byte1, byte2;
		unsigned char lo, hi;	/* legal range of the second byte */
		size_t more;		/* trailing bytes after the second */

		byte1 = bytes[index++];
		if (byte1 < 0x80)
			continue;	/* ASCII */

		/*
		**  Classify the lead byte.  Only the second byte has a
		**  range that depends on the lead byte; that is how
		**  overlong forms, surrogates, and code points beyond
		**  U+10FFFF are excluded.  Plain comparisons are used
		**  instead of a shift-based test because shifting a lead
		**  byte >= 0xF0 left by 28 overflows int.
		*/

		lo = 0x80;
		hi = 0xBF;
		if (byte1 < 0xC2)
		{
			/* stray trailing byte (80..BF) or overlong C0/C1 */
			return false;
		}
		else if (byte1 < 0xE0)
		{
			/* Two-byte form. */
			more = 0;
		}
		else if (byte1 < 0xF0)
		{
			/* Three-byte form. */
			more = 1;
			if (byte1 == 0xE0)
				lo = 0xA0;	/* below would be overlong */
			else if (byte1 == 0xED)
				hi = 0x9F;	/* above would be a surrogate */
		}
		else if (byte1 <= 0xF4)
		{
			/* Four-byte form. */
			more = 2;
			if (byte1 == 0xF0)
				lo = 0x90;	/* below would be overlong */
			else if (byte1 == 0xF4)
				hi = 0x8F;	/* above exceeds U+10FFFF */
		}
		else
		{
			/* F5..FF never appear in UTF-8 */
			return false;
		}

		/* truncated sequence?  (index <= length holds here) */
		if (length - index < more + 1)
			return false;

		byte2 = bytes[index++];
		if (byte2 < lo || byte2 > hi)
			return false;

		/* remaining trailing bytes must be 80..BF (both bounds!) */
		for (; more > 0; more--)
		{
			unsigned char trail;

			trail = bytes[index++];
			if (trail < 0x80 || trail > 0xBF)
				return false;
		}
	}
	return true;
}
#endif /* USE_EAI */