xref: /netbsd-src/external/ibm-public/postfix/dist/src/util/valid_utf8_string.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /*	$NetBSD: valid_utf8_string.c,v 1.2 2017/02/14 01:16:49 christos Exp $	*/
2 
3 /*++
4 /* NAME
5 /*	valid_utf8_string 3
6 /* SUMMARY
7 /*	predicate if string is valid UTF-8
8 /* SYNOPSIS
9 /*	#include <stringops.h>
10 /*
11 /*	int	valid_utf8_string(str, len)
12 /*	const char *str;
13 /*	ssize_t	len;
14 /* DESCRIPTION
15 /*	valid_utf8_string() determines if a string satisfies the UTF-8
16 /*	definition in RFC 3629. That is, it contains proper encodings
17 /*	of code points U+0000..U+10FFFF, excluding over-long encodings
18 /*	and excluding U+D800..U+DFFF surrogates.
19 /*
20 /*	A zero-length string is considered valid.
21 /* DIAGNOSTICS
22 /*	The result value is zero when the caller specifies a negative
23 /*	length, or a string that violates RFC 3629, for example a
24 /*	string that is truncated in the middle of a multi-byte
25 /*	sequence.
26 /* BUGS
27 /*	But wait, there is more. Code points in the range U+FDD0..U+FDEF
28 /*	and ending in FFFE or FFFF are non-characters in UNICODE. This
29 /*	function does not block these.
30 /* SEE ALSO
31 /*	RFC 3629
32 /* LICENSE
33 /* .ad
34 /* .fi
35 /*	The Secure Mailer license must be distributed with this software.
36 /* AUTHOR(S)
37 /*	Wietse Venema
38 /*	IBM T.J. Watson Research
39 /*	P.O. Box 704
40 /*	Yorktown Heights, NY 10598, USA
41 /*--*/
42 
43 /* System library. */
44 
45 #include <sys_defs.h>
46 
47 /* Utility library. */
48 
49 #include <stringops.h>
50 
51 /* valid_utf8_string - validate string according to RFC 3629 */
52 
valid_utf8_string(const char * str,ssize_t len)53 int     valid_utf8_string(const char *str, ssize_t len)
54 {
55     const unsigned char *end = (const unsigned char *) str + len;
56     const unsigned char *cp;
57     unsigned char c0, ch;
58 
59     if (len < 0)
60 	return (0);
61     if (len <= 0)
62 	return (1);
63 
64     /*
65      * Optimized for correct input, time, space, and for CPUs that have a
66      * decent number of registers.
67      */
68     for (cp = (const unsigned char *) str; cp < end; cp++) {
69 	/* Single-byte encodings. */
70 	if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
71 	     /* void */ ;
72 	}
73 	/* Two-byte encodings. */
74 	else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
75 	    /* Exclude over-long encodings. */
76 	    if (UNEXPECTED(c0 < 0xc2)
77 		|| UNEXPECTED(cp + 1 >= end)
78 	    /* Require UTF-8 tail byte. */
79 		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
80 		return (0);
81 	}
82 	/* Three-byte encodings. */
83 	else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
84 	    if (UNEXPECTED(cp + 2 >= end)
85 	    /* Exclude over-long encodings. */
86 		|| UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
87 	    /* Exclude U+D800..U+DFFF. */
88 		|| UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
89 	    /* Require UTF-8 tail byte. */
90 		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
91 		return (0);
92 	}
93 	/* Four-byte encodings. */
94 	else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
95 	    if (UNEXPECTED(cp + 3 >= end)
96 	    /* Exclude over-long encodings. */
97 		|| UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
98 	    /* Exclude code points above U+10FFFF. */
99 		|| UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
100 	    /* Require UTF-8 tail byte. */
101 		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
102 	    /* Require UTF-8 tail byte. */
103 		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
104 		return (0);
105 	}
106 	/* Invalid: c0 >= 0xf5 */
107 	else {
108 	    return (0);
109 	}
110     }
111     return (1);
112 }
113 
114  /*
115   * Stand-alone test program. Each string is a line without line terminator.
116   */
117 #ifdef TEST
118 #include <stdlib.h>
119 #include <vstream.h>
120 #include <vstring.h>
121 #include <vstring_vstream.h>
122 
123 #define STR(x) vstring_str(x)
124 #define LEN(x) VSTRING_LEN(x)
125 
main(void)126 int     main(void)
127 {
128     VSTRING *buf = vstring_alloc(1);
129 
130     while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
131 	vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
132 		       '!' : ' ');
133 	vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
134 	vstream_printf("\n");
135     }
136     vstream_fflush(VSTREAM_OUT);
137     vstring_free(buf);
138     exit(0);
139 }
140 
141 #endif
142