1 /* $NetBSD: valid_utf8_string.c,v 1.2 2017/02/14 01:16:49 christos Exp $ */ 2 3 /*++ 4 /* NAME 5 /* valid_utf8_string 3 6 /* SUMMARY 7 /* predicate if string is valid UTF-8 8 /* SYNOPSIS 9 /* #include <stringops.h> 10 /* 11 /* int valid_utf8_string(str, len) 12 /* const char *str; 13 /* ssize_t len; 14 /* DESCRIPTION 15 /* valid_utf8_string() determines if a string satisfies the UTF-8 16 /* definition in RFC 3629. That is, it contains proper encodings 17 /* of code points U+0000..U+10FFFF, excluding over-long encodings 18 /* and excluding U+D800..U+DFFF surrogates. 19 /* 20 /* A zero-length string is considered valid. 21 /* DIAGNOSTICS 22 /* The result value is zero when the caller specifies a negative 23 /* length, or a string that violates RFC 3629, for example a 24 /* string that is truncated in the middle of a multi-byte 25 /* sequence. 26 /* BUGS 27 /* But wait, there is more. Code points in the range U+FDD0..U+FDEF 28 /* and ending in FFFE or FFFF are non-characters in UNICODE. This 29 /* function does not block these. 30 /* SEE ALSO 31 /* RFC 3629 32 /* LICENSE 33 /* .ad 34 /* .fi 35 /* The Secure Mailer license must be distributed with this software. 36 /* AUTHOR(S) 37 /* Wietse Venema 38 /* IBM T.J. Watson Research 39 /* P.O. Box 704 40 /* Yorktown Heights, NY 10598, USA 41 /*--*/ 42 43 /* System library. */ 44 45 #include <sys_defs.h> 46 47 /* Utility library. */ 48 49 #include <stringops.h> 50 51 /* valid_utf8_string - validate string according to RFC 3629 */ 52 53 int valid_utf8_string(const char *str, ssize_t len) 54 { 55 const unsigned char *end = (const unsigned char *) str + len; 56 const unsigned char *cp; 57 unsigned char c0, ch; 58 59 if (len < 0) 60 return (0); 61 if (len <= 0) 62 return (1); 63 64 /* 65 * Optimized for correct input, time, space, and for CPUs that have a 66 * decent number of registers. 67 */ 68 for (cp = (const unsigned char *) str; cp < end; cp++) { 69 /* Single-byte encodings. */ 70 if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) { 71 /* void */ ; 72 } 73 /* Two-byte encodings. */ 74 else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) { 75 /* Exclude over-long encodings. */ 76 if (UNEXPECTED(c0 < 0xc2) 77 || UNEXPECTED(cp + 1 >= end) 78 /* Require UTF-8 tail byte. */ 79 || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) 80 return (0); 81 } 82 /* Three-byte encodings. */ 83 else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) { 84 if (UNEXPECTED(cp + 2 >= end) 85 /* Exclude over-long encodings. */ 86 || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80)) 87 /* Exclude U+D800..U+DFFF. */ 88 || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf)) 89 /* Require UTF-8 tail byte. */ 90 || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) 91 return (0); 92 } 93 /* Four-byte encodings. */ 94 else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) { 95 if (UNEXPECTED(cp + 3 >= end) 96 /* Exclude over-long encodings. */ 97 || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80)) 98 /* Exclude code points above U+10FFFF. */ 99 || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf)) 100 /* Require UTF-8 tail byte. */ 101 || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80) 102 /* Require UTF-8 tail byte. */ 103 || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) 104 return (0); 105 } 106 /* Invalid: c0 >= 0xf5 */ 107 else { 108 return (0); 109 } 110 } 111 return (1); 112 } 113 114 /* 115 * Stand-alone test program. Each string is a line without line terminator. 116 */ 117 #ifdef TEST 118 #include <stdlib.h> 119 #include <vstream.h> 120 #include <vstring.h> 121 #include <vstring_vstream.h> 122 123 #define STR(x) vstring_str(x) 124 #define LEN(x) VSTRING_LEN(x) 125 126 int main(void) 127 { 128 VSTRING *buf = vstring_alloc(1); 129 130 while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) { 131 vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ? 132 '!' : ' '); 133 vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf)); 134 vstream_printf("\n"); 135 } 136 vstream_fflush(VSTREAM_OUT); 137 vstring_free(buf); 138 exit(0); 139 } 140 141 #endif 142