1 /* 2 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org> 3 * 4 * Permission to use, copy, modify, and distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 /* 18 * A mostly exhaustive test of UTF-8 decoder and encoder 19 */ 20 21 #include <stdio.h> 22 #include <string.h> 23 #include <err.h> 24 25 #include <openssl/asn1.h> 26 27 #define UNCHANGED 0xfedcba98 28 29 #define ASSERT(x) \ 30 do { \ 31 if (!(x)) \ 32 errx(1, "test failed at line %d: %s", \ 33 __LINE__, #x); \ 34 } while (0) 35 36 int 37 main(void) 38 { 39 unsigned char testbuf[] = "012345"; 40 const unsigned char zerobuf[sizeof testbuf] = { 0 }; 41 unsigned long value; 42 int i, j, k, l, ret; 43 44 /* 45 * First, verify UTF8_getc() 46 */ 47 value = UNCHANGED; 48 ret = UTF8_getc(testbuf, 0, &value); 49 ASSERT(ret == 0); 50 ASSERT(value == UNCHANGED); 51 52 /* check all valid single-byte chars */ 53 for (i = 0; i < 0x80; i++) { 54 testbuf[0] = i; 55 ret = UTF8_getc(testbuf, 1, &value); 56 ASSERT(ret == 1); 57 ASSERT(value == i); 58 59 ret = UTF8_getc(testbuf, 2, &value); 60 ASSERT(ret == 1); 61 ASSERT(value == i); 62 } 63 64 /* 65 * Verify failure on all invalid initial bytes: 66 * 0x80 - 0xBF following bytes only 67 * 0xC0 - 0xC1 used to be in non-shortest forms 68 * 0xF5 - 0xFD used to be initial for 5 and 6 byte sequences 69 * 0xFE - 0xFF have never been valid in utf-8 70 */ 71 for (i = 0x80; i < 0xC2; i++) { 72 value = UNCHANGED; 73 testbuf[0] = i; 74 ret = UTF8_getc(testbuf, 1, &value); 75 ASSERT(ret == -2); 76 ASSERT(value == UNCHANGED); 77 } 78 for (i = 0xF5; i < 0x100; i++) { 79 value = UNCHANGED; 80 testbuf[0] = i; 81 ret = UTF8_getc(testbuf, 1, &value); 82 ASSERT(ret == -2); 83 ASSERT(value == UNCHANGED); 84 } 85 86 /* 87 * Verify handling of all two-byte sequences 88 */ 89 for (i = 0xC2; i < 0xE0; i++) { 90 testbuf[0] = i; 91 92 for (j = 0; j < 0x100; j++) { 93 testbuf[1] = j; 94 95 value = UNCHANGED; 96 ret = UTF8_getc(testbuf, 1, &value); 97 ASSERT(ret == -1); 98 ASSERT(value == UNCHANGED); 99 100 ret = UTF8_getc(testbuf, 2, &value); 101 102 /* outside range of trailing bytes */ 103 if (j < 0x80 || j > 0xBF) { 104 ASSERT(ret == -3); 105 ASSERT(value == UNCHANGED); 106 continue; 107 } 108 109 /* valid */ 110 ASSERT(ret == 2); 111 ASSERT((value & 0x3F) == (j & 0x3F)); 112 ASSERT(value >> 6 == (i & 0x1F)); 113 } 114 } 115 116 /* 117 * Verify handling of all three-byte sequences 118 */ 119 for (i = 0xE0; i < 0xF0; i++) { 120 testbuf[0] = i; 121 122 for (j = 0; j < 0x100; j++) { 123 testbuf[1] = j; 124 125 for (k = 0; k < 0x100; k++) { 126 testbuf[2] = k; 127 128 value = UNCHANGED; 129 ret = UTF8_getc(testbuf, 2, &value); 130 ASSERT(ret == -1); 131 ASSERT(value == UNCHANGED); 132 133 ret = UTF8_getc(testbuf, 3, &value); 134 135 /* outside range of trailing bytes */ 136 if (j < 0x80 || j > 0xBF || 137 k < 0x80 || k > 0xBF) { 138 ASSERT(ret == -3); 139 ASSERT(value == UNCHANGED); 140 continue; 141 } 142 143 /* non-shortest form */ 144 if (i == 0xE0 && j < 0xA0) { 145 ASSERT(ret == -4); 146 ASSERT(value == UNCHANGED); 147 continue; 148 } 149 150 /* surrogate pair code point */ 151 if (i == 0xED && j > 0x9F) { 152 ASSERT(ret == -2); 153 ASSERT(value == UNCHANGED); 154 continue; 155 } 156 157 ASSERT(ret == 3); 158 ASSERT((value & 0x3F) == (k & 0x3F)); 159 ASSERT(((value >> 6) & 0x3F) == (j & 0x3F)); 160 ASSERT(value >> 12 == (i & 0x0F)); 161 } 162 } 163 } 164 165 /* 166 * Verify handling of all four-byte sequences 167 */ 168 for (i = 0xF0; i < 0xF5; i++) { 169 testbuf[0] = i; 170 171 for (j = 0; j < 0x100; j++) { 172 testbuf[1] = j; 173 174 for (k = 0; k < 0x100; k++) { 175 testbuf[2] = k; 176 177 for (l = 0; l < 0x100; l++) { 178 testbuf[3] = l; 179 180 value = UNCHANGED; 181 ret = UTF8_getc(testbuf, 3, &value); 182 ASSERT(ret == -1); 183 ASSERT(value == UNCHANGED); 184 185 ret = UTF8_getc(testbuf, 4, &value); 186 187 /* outside range of trailing bytes */ 188 if (j < 0x80 || j > 0xBF || 189 k < 0x80 || k > 0xBF || 190 l < 0x80 || l > 0xBF) { 191 ASSERT(ret == -3); 192 ASSERT(value == UNCHANGED); 193 continue; 194 } 195 196 /* non-shortest form */ 197 if (i == 0xF0 && j < 0x90) { 198 ASSERT(ret == -4); 199 ASSERT(value == UNCHANGED); 200 continue; 201 } 202 203 /* beyond end of UCS range */ 204 if (i == 0xF4 && j > 0x8F) { 205 ASSERT(ret == -2); 206 ASSERT(value == UNCHANGED); 207 continue; 208 } 209 210 ASSERT(ret == 4); 211 ASSERT((value & 0x3F) == (l & 0x3F)); 212 ASSERT(((value >> 6) & 0x3F) == 213 (k & 0x3F)); 214 ASSERT(((value >> 12) & 0x3F) == 215 (j & 0x3F)); 216 ASSERT(value >> 18 == (i & 0x07)); 217 } 218 } 219 } 220 } 221 222 223 /* 224 * Next, verify UTF8_putc() 225 */ 226 memset(testbuf, 0, sizeof testbuf); 227 228 /* single-byte sequences */ 229 for (i = 0; i < 0x80; i++) { 230 ret = UTF8_putc(NULL, 0, i); 231 ASSERT(ret == 1); 232 233 testbuf[0] = 0; 234 ret = UTF8_putc(testbuf, 0, i); 235 ASSERT(ret == -1); 236 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 237 238 ret = UTF8_putc(testbuf, 1, i); 239 ASSERT(ret == 1); 240 ASSERT(testbuf[0] == i); 241 ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0); 242 } 243 244 /* two-byte sequences */ 245 for (i = 0x80; i < 0x800; i++) { 246 ret = UTF8_putc(NULL, 0, i); 247 ASSERT(ret == 2); 248 249 testbuf[0] = testbuf[1] = 0; 250 ret = UTF8_putc(testbuf, 1, i); 251 ASSERT(ret == -1); 252 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 253 254 ret = UTF8_putc(testbuf, 2, i); 255 ASSERT(ret == 2); 256 ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0); 257 ret = UTF8_getc(testbuf, 2, &value); 258 ASSERT(ret == 2); 259 ASSERT(value == i); 260 } 261 262 /* three-byte sequences */ 263 for (i = 0x800; i < 0x10000; i++) { 264 if (i >= 0xD800 && i < 0xE000) { 265 /* surrogates aren't valid */ 266 ret = UTF8_putc(NULL, 0, i); 267 ASSERT(ret == -2); 268 continue; 269 } 270 271 ret = UTF8_putc(NULL, 0, i); 272 ASSERT(ret == 3); 273 274 testbuf[0] = testbuf[1] = testbuf[2] = 0; 275 ret = UTF8_putc(testbuf, 2, i); 276 ASSERT(ret == -1); 277 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 278 279 ret = UTF8_putc(testbuf, 3, i); 280 ASSERT(ret == 3); 281 ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0); 282 ret = UTF8_getc(testbuf, 3, &value); 283 ASSERT(ret == 3); 284 ASSERT(value == i); 285 } 286 287 /* four-byte sequences */ 288 for (i = 0x10000; i < 0x110000; i++) { 289 ret = UTF8_putc(NULL, 0, i); 290 ASSERT(ret == 4); 291 292 testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0; 293 ret = UTF8_putc(testbuf, 3, i); 294 ASSERT(ret == -1); 295 ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0); 296 297 ret = UTF8_putc(testbuf, 4, i); 298 ASSERT(ret == 4); 299 ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0); 300 ret = UTF8_getc(testbuf, 4, &value); 301 ASSERT(ret == 4); 302 ASSERT(value == i); 303 } 304 305 /* spot check some larger values to confirm error return */ 306 for (i = 0x110000; i < 0x110100; i++) { 307 ret = UTF8_putc(NULL, 0, i); 308 ASSERT(ret == -2); 309 } 310 for (value = (unsigned long)-1; value > (unsigned long)-256; value--) { 311 ret = UTF8_putc(NULL, 0, value); 312 ASSERT(ret == -2); 313 } 314 315 return 0; 316 } 317