1 /* $NetBSD: regex.c,v 1.6 2021/04/05 11:27:02 rillig Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * This Source Code Form is subject to the terms of the Mozilla Public 7 * License, v. 2.0. If a copy of the MPL was not distributed with this 8 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 9 * 10 * See the COPYRIGHT file distributed with this work for additional 11 * information regarding copyright ownership. 12 */ 13 14 #include <stdbool.h> 15 16 #include <isc/file.h> 17 #include <isc/print.h> 18 #include <isc/regex.h> 19 #include <isc/string.h> 20 21 #if VALREGEX_REPORT_REASON 22 #define FAIL(x) \ 23 do { \ 24 reason = (x); \ 25 goto error; \ 26 } while (0) 27 #else /* if VALREGEX_REPORT_REASON */ 28 #define FAIL(x) goto error 29 #endif /* if VALREGEX_REPORT_REASON */ 30 31 /* 32 * Validate the regular expression 'C' locale. 33 */ 34 int 35 isc_regex_validate(const char *c) { 36 enum { 37 none, 38 parse_bracket, 39 parse_bound, 40 parse_ce, 41 parse_ec, 42 parse_cc 43 } state = none; 44 /* Well known character classes. */ 45 const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:", 46 ":graph:", ":space:", ":blank:", ":lower:", 47 ":upper:", ":cntrl:", ":print:", ":xdigit:" }; 48 bool seen_comma = false; 49 bool seen_high = false; 50 bool seen_char = false; 51 bool seen_ec = false; 52 bool seen_ce = false; 53 bool have_atom = false; 54 int group = 0; 55 int range = 0; 56 int sub = 0; 57 bool empty_ok = false; 58 bool neg = false; 59 bool was_multiple = false; 60 unsigned int low = 0; 61 unsigned int high = 0; 62 const char *ccname = NULL; 63 int range_start = 0; 64 #if VALREGEX_REPORT_REASON 65 const char *reason = ""; 66 #endif /* if VALREGEX_REPORT_REASON */ 67 68 if (c == NULL || *c == 0) { 69 FAIL("empty string"); 70 } 71 72 while (c != NULL && *c != 0) { 73 switch (state) { 74 case none: 75 switch (*c) { 76 case '\\': /* make literal */ 77 ++c; 78 switch (*c) { 79 case '1': 80 case '2': 81 case '3': 82 case '4': 83 case '5': 84 case '6': 85 case '7': 86 case '8': 87 case '9': 88 if ((*c - '0') > sub) { 89 FAIL("bad back reference"); 90 } 91 have_atom = true; 92 was_multiple = false; 93 break; 94 case 0: 95 FAIL("escaped end-of-string"); 96 default: 97 goto literal; 98 } 99 ++c; 100 break; 101 case '[': /* bracket start */ 102 ++c; 103 neg = false; 104 was_multiple = false; 105 seen_char = false; 106 state = parse_bracket; 107 break; 108 case '{': /* bound start */ 109 switch (c[1]) { 110 case '0': 111 case '1': 112 case '2': 113 case '3': 114 case '4': 115 case '5': 116 case '6': 117 case '7': 118 case '8': 119 case '9': 120 if (!have_atom) { 121 FAIL("no atom"); 122 } 123 if (was_multiple) { 124 FAIL("was multiple"); 125 } 126 seen_comma = false; 127 seen_high = false; 128 low = high = 0; 129 state = parse_bound; 130 break; 131 default: 132 goto literal; 133 } 134 ++c; 135 have_atom = true; 136 was_multiple = true; 137 break; 138 case '}': 139 goto literal; 140 case '(': /* group start */ 141 have_atom = false; 142 was_multiple = false; 143 empty_ok = true; 144 ++group; 145 ++sub; 146 ++c; 147 break; 148 case ')': /* group end */ 149 if (group && !have_atom && !empty_ok) { 150 FAIL("empty alternative"); 151 } 152 have_atom = true; 153 was_multiple = false; 154 if (group != 0) { 155 --group; 156 } 157 ++c; 158 break; 159 case '|': /* alternative separator */ 160 if (!have_atom) { 161 FAIL("no atom"); 162 } 163 have_atom = false; 164 empty_ok = false; 165 was_multiple = false; 166 ++c; 167 break; 168 case '^': 169 case '$': 170 have_atom = true; 171 was_multiple = true; 172 ++c; 173 break; 174 case '+': 175 case '*': 176 case '?': 177 if (was_multiple) { 178 FAIL("was multiple"); 179 } 180 if (!have_atom) { 181 FAIL("no atom"); 182 } 183 have_atom = true; 184 was_multiple = true; 185 ++c; 186 break; 187 case '.': 188 default: 189 literal: 190 have_atom = true; 191 was_multiple = false; 192 ++c; 193 break; 194 } 195 break; 196 case parse_bound: 197 switch (*c) { 198 case '0': 199 case '1': 200 case '2': 201 case '3': 202 case '4': 203 case '5': 204 case '6': 205 case '7': 206 case '8': 207 case '9': 208 if (!seen_comma) { 209 low = low * 10 + *c - '0'; 210 if (low > 255) { 211 FAIL("lower bound too big"); 212 } 213 } else { 214 seen_high = true; 215 high = high * 10 + *c - '0'; 216 if (high > 255) { 217 FAIL("upper bound too big"); 218 } 219 } 220 ++c; 221 break; 222 case ',': 223 if (seen_comma) { 224 FAIL("multiple commas"); 225 } 226 seen_comma = true; 227 ++c; 228 break; 229 default: 230 case '{': 231 FAIL("non digit/comma"); 232 case '}': 233 if (seen_high && low > high) { 234 FAIL("bad parse bound"); 235 } 236 seen_comma = false; 237 state = none; 238 ++c; 239 break; 240 } 241 break; 242 case parse_bracket: 243 switch (*c) { 244 case '^': 245 if (seen_char || neg) { 246 goto inside; 247 } 248 neg = true; 249 ++c; 250 break; 251 case '-': 252 if (range == 2) { 253 goto inside; 254 } 255 if (!seen_char) { 256 goto inside; 257 } 258 if (range == 1) { 259 FAIL("bad range"); 260 } 261 range = 2; 262 ++c; 263 break; 264 case '[': 265 ++c; 266 switch (*c) { 267 case '.': /* collating element */ 268 if (range != 0) { 269 --range; 270 } 271 ++c; 272 state = parse_ce; 273 seen_ce = false; 274 break; 275 case '=': /* equivalence class */ 276 if (range == 2) { 277 FAIL("equivalence class in " 278 "range"); 279 } 280 ++c; 281 state = parse_ec; 282 seen_ec = false; 283 break; 284 case ':': /* character class */ 285 if (range == 2) { 286 FAIL("character class in " 287 "range"); 288 } 289 ccname = c; 290 ++c; 291 state = parse_cc; 292 break; 293 } 294 seen_char = true; 295 break; 296 case ']': 297 if (!c[1] && !seen_char) { 298 FAIL("unfinished brace"); 299 } 300 if (!seen_char) { 301 goto inside; 302 } 303 ++c; 304 range = 0; 305 have_atom = true; 306 state = none; 307 break; 308 default: 309 inside: 310 seen_char = true; 311 if (range == 2 && (*c & 0xff) < range_start) { 312 FAIL("out of order range"); 313 } 314 if (range != 0) { 315 --range; 316 } 317 range_start = *c & 0xff; 318 ++c; 319 break; 320 } 321 break; 322 case parse_ce: 323 switch (*c) { 324 case '.': 325 ++c; 326 switch (*c) { 327 case ']': 328 if (!seen_ce) { 329 FAIL("empty ce"); 330 } 331 ++c; 332 state = parse_bracket; 333 break; 334 default: 335 if (seen_ce) { 336 range_start = 256; 337 } else { 338 range_start = '.'; 339 } 340 seen_ce = true; 341 break; 342 } 343 break; 344 default: 345 if (seen_ce) { 346 range_start = 256; 347 } else { 348 range_start = *c; 349 } 350 seen_ce = true; 351 ++c; 352 break; 353 } 354 break; 355 case parse_ec: 356 switch (*c) { 357 case '=': 358 ++c; 359 switch (*c) { 360 case ']': 361 if (!seen_ec) { 362 FAIL("no ec"); 363 } 364 ++c; 365 state = parse_bracket; 366 break; 367 default: 368 seen_ec = true; 369 break; 370 } 371 break; 372 default: 373 seen_ec = true; 374 ++c; 375 break; 376 } 377 break; 378 case parse_cc: 379 switch (*c) { 380 case ':': 381 ++c; 382 switch (*c) { 383 case ']': { 384 unsigned int i; 385 bool found = false; 386 for (i = 0; 387 i < sizeof(cc) / sizeof(*cc); i++) 388 { 389 unsigned int len; 390 len = strlen(cc[i]); 391 if (len != 392 (unsigned int)(c - ccname)) 393 { 394 continue; 395 } 396 if (strncmp(cc[i], ccname, len)) 397 { 398 continue; 399 } 400 found = true; 401 } 402 if (!found) { 403 FAIL("unknown cc"); 404 } 405 ++c; 406 state = parse_bracket; 407 break; 408 } 409 default: 410 break; 411 } 412 break; 413 default: 414 ++c; 415 break; 416 } 417 break; 418 } 419 } 420 if (group != 0) { 421 FAIL("group open"); 422 } 423 if (state != none) { 424 FAIL("incomplete"); 425 } 426 if (!have_atom) { 427 FAIL("no atom"); 428 } 429 return (sub); 430 431 error: 432 #if VALREGEX_REPORT_REASON 433 fprintf(stderr, "%s\n", reason); 434 #endif /* if VALREGEX_REPORT_REASON */ 435 return (-1); 436 } 437