1 /* $NetBSD: valid_utf8_string.c,v 1.2 2017/02/14 01:16:49 christos Exp $ */
2
3 /*++
4 /* NAME
5 /* valid_utf8_string 3
6 /* SUMMARY
7 /* predicate if string is valid UTF-8
8 /* SYNOPSIS
9 /* #include <stringops.h>
10 /*
11 /* int valid_utf8_string(str, len)
12 /* const char *str;
13 /* ssize_t len;
14 /* DESCRIPTION
15 /* valid_utf8_string() determines if a string satisfies the UTF-8
16 /* definition in RFC 3629. That is, it contains proper encodings
17 /* of code points U+0000..U+10FFFF, excluding over-long encodings
18 /* and excluding U+D800..U+DFFF surrogates.
19 /*
20 /* A zero-length string is considered valid.
21 /* DIAGNOSTICS
22 /* The result value is zero when the caller specifies a negative
23 /* length, or a string that violates RFC 3629, for example a
24 /* string that is truncated in the middle of a multi-byte
25 /* sequence.
26 /* BUGS
/* In addition, code points in the range U+FDD0..U+FDEF, and code
/* points whose last four hex digits are FFFE or FFFF, are
/* non-characters in Unicode. This function does not reject these.
30 /* SEE ALSO
31 /* RFC 3629
32 /* LICENSE
33 /* .ad
34 /* .fi
35 /* The Secure Mailer license must be distributed with this software.
36 /* AUTHOR(S)
37 /* Wietse Venema
38 /* IBM T.J. Watson Research
39 /* P.O. Box 704
40 /* Yorktown Heights, NY 10598, USA
41 /*--*/
42
43 /* System library. */
44
45 #include <sys_defs.h>
46
47 /* Utility library. */
48
49 #include <stringops.h>
50
/*
 * EXPECTED() and UNEXPECTED() wrap a condition without changing its truth
 * value (presumably branch-prediction hints supplied by the project headers
 * included by this file -- confirm). The pass-through fallbacks below only
 * take effect when the macros are not already defined.
 */
#ifndef EXPECTED
#define EXPECTED(x)	(x)
#endif
#ifndef UNEXPECTED
#define UNEXPECTED(x)	(x)
#endif

/* valid_utf8_string - validate string according to RFC 3629 */

/*
 * Arguments: str points to the bytes to examine, len is the number of bytes.
 * The bytes need not be null-terminated; a null byte inside the buffer is
 * treated like any other single-byte encoding.
 *
 * Result: 1 when len is zero or when all len bytes form well-formed UTF-8
 * sequences (no over-long encodings, no U+D800..U+DFFF surrogates, nothing
 * above U+10FFFF); 0 when len is negative, when a sequence is malformed, or
 * when the buffer ends in the middle of a multi-byte sequence.
 *
 * str is not examined when len <= 0.
 */
int     valid_utf8_string(const char *str, ssize_t len)
{
    const unsigned char *cp;
    const unsigned char *end;
    unsigned char c0;
    unsigned char ch;

    if (len < 0)
        return (0);
    if (len == 0)
        return (1);

    /*
     * Compute the end pointer only after the length has been validated, so
     * that no pointer arithmetic is done with a negative length.
     */
    end = (const unsigned char *) str + len;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers.
     * 
     * The "is the sequence truncated" tests compare the number of remaining
     * bytes (end - cp) instead of forming cp + N, because a pointer more
     * than one past the end of the buffer must not be computed.
     */
    for (cp = (const unsigned char *) str; cp < end; cp++) {
        /* Single-byte encodings. */
        if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
            /* void */ ;
        }
        /* Two-byte encodings. */
        else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
            /* Exclude stray tail bytes and over-long encodings. */
            if (UNEXPECTED(c0 < 0xc2)
            /* Require one more byte. */
                || UNEXPECTED(end - cp < 2)
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Three-byte encodings. */
        else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
            /* Require two more bytes. */
            if (UNEXPECTED(end - cp < 3)
            /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
            /* Exclude U+D800..U+DFFF. */
                || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Four-byte encodings. */
        else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
            /* Require three more bytes. */
            if (UNEXPECTED(end - cp < 4)
            /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
            /* Exclude code points above U+10FFFF. */
                || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80)
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Invalid: c0 >= 0xf5 */
        else {
            return (0);
        }
    }
    return (1);
}
113
114 /*
115 * Stand-alone test program. Each string is a line without line terminator.
116 */
117 #ifdef TEST
118 #include <stdlib.h>
119 #include <vstream.h>
120 #include <vstring.h>
121 #include <vstring_vstream.h>
122
123 #define STR(x) vstring_str(x)
124 #define LEN(x) VSTRING_LEN(x)
125
main(void)126 int main(void)
127 {
128 VSTRING *buf = vstring_alloc(1);
129
130 while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
131 vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
132 '!' : ' ');
133 vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
134 vstream_printf("\n");
135 }
136 vstream_fflush(VSTREAM_OUT);
137 vstring_free(buf);
138 exit(0);
139 }
140
141 #endif
142