1 /* $NetBSD: regex.c,v 1.4 2020/05/24 19:46:26 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * This Source Code Form is subject to the terms of the Mozilla Public 7 * License, v. 2.0. If a copy of the MPL was not distributed with this 8 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 * 10 * See the COPYRIGHT file distributed with this work for additional 11 * information regarding copyright ownership. 12 */ 13 14 #include <stdbool.h> 15 16 #include <isc/file.h> 17 #include <isc/print.h> 18 #include <isc/regex.h> 19 #include <isc/string.h> 20 21 #if VALREGEX_REPORT_REASON 22 #define FAIL(x) \ 23 do { \ 24 reason = (x); \ 25 goto error; \ 26 } while (/*CONSTCOND*/0) 27 #else /* if VALREGEX_REPORT_REASON */ 28 #define FAIL(x) goto error 29 #endif /* if VALREGEX_REPORT_REASON */ 30 31 /* 32 * Validate the regular expression 'C' locale. 33 */ 34 int 35 isc_regex_validate(const char *c) { 36 enum { none, 37 parse_bracket, 38 parse_bound, 39 parse_ce, 40 parse_ec, 41 parse_cc } state = none; 42 /* Well known character classes. */ 43 const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:", 44 ":graph:", ":space:", ":blank:", ":lower:", 45 ":upper:", ":cntrl:", ":print:", ":xdigit:" }; 46 bool seen_comma = false; 47 bool seen_high = false; 48 bool seen_char = false; 49 bool seen_ec = false; 50 bool seen_ce = false; 51 bool have_atom = false; 52 int group = 0; 53 int range = 0; 54 int sub = 0; 55 bool empty_ok = false; 56 bool neg = false; 57 bool was_multiple = false; 58 unsigned int low = 0; 59 unsigned int high = 0; 60 const char *ccname = NULL; 61 int range_start = 0; 62 #if VALREGEX_REPORT_REASON 63 const char *reason = ""; 64 #endif /* if VALREGEX_REPORT_REASON */ 65 66 if (c == NULL || *c == 0) { 67 FAIL("empty string"); 68 } 69 70 while (c != NULL && *c != 0) { 71 switch (state) { 72 case none: 73 switch (*c) { 74 case '\\': /* make literal */ 75 ++c; 76 switch (*c) { 77 case '1': 78 case '2': 79 case '3': 80 case '4': 81 case '5': 82 case '6': 83 case '7': 84 case '8': 85 case '9': 86 if ((*c - '0') > sub) { 87 FAIL("bad back reference"); 88 } 89 have_atom = true; 90 was_multiple = false; 91 break; 92 case 0: 93 FAIL("escaped end-of-string"); 94 default: 95 goto literal; 96 } 97 ++c; 98 break; 99 case '[': /* bracket start */ 100 ++c; 101 neg = false; 102 was_multiple = false; 103 seen_char = false; 104 state = parse_bracket; 105 break; 106 case '{': /* bound start */ 107 switch (c[1]) { 108 case '0': 109 case '1': 110 case '2': 111 case '3': 112 case '4': 113 case '5': 114 case '6': 115 case '7': 116 case '8': 117 case '9': 118 if (!have_atom) { 119 FAIL("no atom"); 120 } 121 if (was_multiple) { 122 FAIL("was multiple"); 123 } 124 seen_comma = false; 125 seen_high = false; 126 low = high = 0; 127 state = parse_bound; 128 break; 129 default: 130 goto literal; 131 } 132 ++c; 133 have_atom = true; 134 was_multiple = true; 135 break; 136 case '}': 137 goto literal; 138 case '(': /* group start */ 139 have_atom = false; 140 was_multiple = false; 141 empty_ok = true; 142 ++group; 143 ++sub; 144 ++c; 145 break; 146 case ')': /* group end */ 147 if (group && !have_atom && !empty_ok) { 148 FAIL("empty alternative"); 149 } 150 have_atom = true; 151 was_multiple = false; 152 if (group != 0) { 153 --group; 154 } 155 ++c; 156 break; 157 case '|': /* alternative separator */ 158 if (!have_atom) { 159 FAIL("no atom"); 160 } 161 have_atom = false; 162 empty_ok = false; 163 was_multiple = false; 164 ++c; 165 break; 166 case '^': 167 case '$': 168 have_atom = true; 169 was_multiple = true; 170 ++c; 171 break; 172 case '+': 173 case '*': 174 case '?': 175 if (was_multiple) { 176 FAIL("was multiple"); 177 } 178 if (!have_atom) { 179 FAIL("no atom"); 180 } 181 have_atom = true; 182 was_multiple = true; 183 ++c; 184 break; 185 case '.': 186 default: 187 literal: 188 have_atom = true; 189 was_multiple = false; 190 ++c; 191 break; 192 } 193 break; 194 case parse_bound: 195 switch (*c) { 196 case '0': 197 case '1': 198 case '2': 199 case '3': 200 case '4': 201 case '5': 202 case '6': 203 case '7': 204 case '8': 205 case '9': 206 if (!seen_comma) { 207 low = low * 10 + *c - '0'; 208 if (low > 255) { 209 FAIL("lower bound too big"); 210 } 211 } else { 212 seen_high = true; 213 high = high * 10 + *c - '0'; 214 if (high > 255) { 215 FAIL("upper bound too big"); 216 } 217 } 218 ++c; 219 break; 220 case ',': 221 if (seen_comma) { 222 FAIL("multiple commas"); 223 } 224 seen_comma = true; 225 ++c; 226 break; 227 default: 228 case '{': 229 FAIL("non digit/comma"); 230 case '}': 231 if (seen_high && low > high) { 232 FAIL("bad parse bound"); 233 } 234 seen_comma = false; 235 state = none; 236 ++c; 237 break; 238 } 239 break; 240 case parse_bracket: 241 switch (*c) { 242 case '^': 243 if (seen_char || neg) { 244 goto inside; 245 } 246 neg = true; 247 ++c; 248 break; 249 case '-': 250 if (range == 2) { 251 goto inside; 252 } 253 if (!seen_char) { 254 goto inside; 255 } 256 if (range == 1) { 257 FAIL("bad range"); 258 } 259 range = 2; 260 ++c; 261 break; 262 case '[': 263 ++c; 264 switch (*c) { 265 case '.': /* collating element */ 266 if (range != 0) { 267 --range; 268 } 269 ++c; 270 state = parse_ce; 271 seen_ce = false; 272 break; 273 case '=': /* equivalence class */ 274 if (range == 2) { 275 FAIL("equivalence class in " 276 "range"); 277 } 278 ++c; 279 state = parse_ec; 280 seen_ec = false; 281 break; 282 case ':': /* character class */ 283 if (range == 2) { 284 FAIL("character class in " 285 "range"); 286 } 287 ccname = c; 288 ++c; 289 state = parse_cc; 290 break; 291 } 292 seen_char = true; 293 break; 294 case ']': 295 if (!c[1] && !seen_char) { 296 FAIL("unfinished brace"); 297 } 298 if (!seen_char) { 299 goto inside; 300 } 301 ++c; 302 range = 0; 303 have_atom = true; 304 state = none; 305 break; 306 default: 307 inside: 308 seen_char = true; 309 if (range == 2 && (*c & 0xff) < range_start) { 310 FAIL("out of order range"); 311 } 312 if (range != 0) { 313 --range; 314 } 315 range_start = *c & 0xff; 316 ++c; 317 break; 318 } 319 break; 320 case parse_ce: 321 switch (*c) { 322 case '.': 323 ++c; 324 switch (*c) { 325 case ']': 326 if (!seen_ce) { 327 FAIL("empty ce"); 328 } 329 ++c; 330 state = parse_bracket; 331 break; 332 default: 333 if (seen_ce) { 334 range_start = 256; 335 } else { 336 range_start = '.'; 337 } 338 seen_ce = true; 339 break; 340 } 341 break; 342 default: 343 if (seen_ce) { 344 range_start = 256; 345 } else { 346 range_start = *c; 347 } 348 seen_ce = true; 349 ++c; 350 break; 351 } 352 break; 353 case parse_ec: 354 switch (*c) { 355 case '=': 356 ++c; 357 switch (*c) { 358 case ']': 359 if (!seen_ec) { 360 FAIL("no ec"); 361 } 362 ++c; 363 state = parse_bracket; 364 break; 365 default: 366 seen_ec = true; 367 break; 368 } 369 break; 370 default: 371 seen_ec = true; 372 ++c; 373 break; 374 } 375 break; 376 case parse_cc: 377 switch (*c) { 378 case ':': 379 ++c; 380 switch (*c) { 381 case ']': { 382 unsigned int i; 383 bool found = false; 384 for (i = 0; 385 i < sizeof(cc) / sizeof(*cc); i++) 386 { 387 unsigned int len; 388 len = strlen(cc[i]); 389 if (len != 390 (unsigned int)(c - ccname)) 391 { 392 continue; 393 } 394 if (strncmp(cc[i], ccname, len)) 395 { 396 continue; 397 } 398 found = true; 399 } 400 if (!found) { 401 FAIL("unknown cc"); 402 } 403 ++c; 404 state = parse_bracket; 405 break; 406 } 407 default: 408 break; 409 } 410 break; 411 default: 412 ++c; 413 break; 414 } 415 break; 416 } 417 } 418 if (group != 0) { 419 FAIL("group open"); 420 } 421 if (state != none) { 422 FAIL("incomplete"); 423 } 424 if (!have_atom) { 425 FAIL("no atom"); 426 } 427 return (sub); 428 429 error: 430 #if VALREGEX_REPORT_REASON 431 fprintf(stderr, "%s\n", reason); 432 #endif /* if VALREGEX_REPORT_REASON */ 433 return (-1); 434 } 435