1 /* $NetBSD: regex.c,v 1.7 2022/09/23 12:15:33 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 #include <stdbool.h> 17 18 #include <isc/file.h> 19 #include <isc/print.h> 20 #include <isc/regex.h> 21 #include <isc/string.h> 22 23 #if VALREGEX_REPORT_REASON 24 #define FAIL(x) \ 25 do { \ 26 reason = (x); \ 27 goto error; \ 28 } while (0) 29 #else /* if VALREGEX_REPORT_REASON */ 30 #define FAIL(x) goto error 31 #endif /* if VALREGEX_REPORT_REASON */ 32 33 /* 34 * Validate the regular expression 'C' locale. 35 */ 36 int 37 isc_regex_validate(const char *c) { 38 enum { 39 none, 40 parse_bracket, 41 parse_bound, 42 parse_ce, 43 parse_ec, 44 parse_cc 45 } state = none; 46 /* Well known character classes. */ 47 const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:", 48 ":graph:", ":space:", ":blank:", ":lower:", 49 ":upper:", ":cntrl:", ":print:", ":xdigit:" }; 50 bool seen_comma = false; 51 bool seen_high = false; 52 bool seen_char = false; 53 bool seen_ec = false; 54 bool seen_ce = false; 55 bool have_atom = false; 56 int group = 0; 57 int range = 0; 58 int sub = 0; 59 bool empty_ok = false; 60 bool neg = false; 61 bool was_multiple = false; 62 unsigned int low = 0; 63 unsigned int high = 0; 64 const char *ccname = NULL; 65 int range_start = 0; 66 #if VALREGEX_REPORT_REASON 67 const char *reason = ""; 68 #endif /* if VALREGEX_REPORT_REASON */ 69 70 if (c == NULL || *c == 0) { 71 FAIL("empty string"); 72 } 73 74 while (c != NULL && *c != 0) { 75 switch (state) { 76 case none: 77 switch (*c) { 78 case '\\': /* make literal */ 79 ++c; 80 switch (*c) { 81 case '1': 82 case '2': 83 case '3': 84 case '4': 85 case '5': 86 case '6': 87 case '7': 88 case '8': 89 case '9': 90 if ((*c - '0') > sub) { 91 FAIL("bad back reference"); 92 } 93 have_atom = true; 94 was_multiple = false; 95 break; 96 case 0: 97 FAIL("escaped end-of-string"); 98 default: 99 goto literal; 100 } 101 ++c; 102 break; 103 case '[': /* bracket start */ 104 ++c; 105 neg = false; 106 was_multiple = false; 107 seen_char = false; 108 state = parse_bracket; 109 break; 110 case '{': /* bound start */ 111 switch (c[1]) { 112 case '0': 113 case '1': 114 case '2': 115 case '3': 116 case '4': 117 case '5': 118 case '6': 119 case '7': 120 case '8': 121 case '9': 122 if (!have_atom) { 123 FAIL("no atom"); 124 } 125 if (was_multiple) { 126 FAIL("was multiple"); 127 } 128 seen_comma = false; 129 seen_high = false; 130 low = high = 0; 131 state = parse_bound; 132 break; 133 default: 134 goto literal; 135 } 136 ++c; 137 have_atom = true; 138 was_multiple = true; 139 break; 140 case '}': 141 goto literal; 142 case '(': /* group start */ 143 have_atom = false; 144 was_multiple = false; 145 empty_ok = true; 146 ++group; 147 ++sub; 148 ++c; 149 break; 150 case ')': /* group end */ 151 if (group && !have_atom && !empty_ok) { 152 FAIL("empty alternative"); 153 } 154 have_atom = true; 155 was_multiple = false; 156 if (group != 0) { 157 --group; 158 } 159 ++c; 160 break; 161 case '|': /* alternative separator */ 162 if (!have_atom) { 163 FAIL("no atom"); 164 } 165 have_atom = false; 166 empty_ok = false; 167 was_multiple = false; 168 ++c; 169 break; 170 case '^': 171 case '$': 172 have_atom = true; 173 was_multiple = true; 174 ++c; 175 break; 176 case '+': 177 case '*': 178 case '?': 179 if (was_multiple) { 180 FAIL("was multiple"); 181 } 182 if (!have_atom) { 183 FAIL("no atom"); 184 } 185 have_atom = true; 186 was_multiple = true; 187 ++c; 188 break; 189 case '.': 190 default: 191 literal: 192 have_atom = true; 193 was_multiple = false; 194 ++c; 195 break; 196 } 197 break; 198 case parse_bound: 199 switch (*c) { 200 case '0': 201 case '1': 202 case '2': 203 case '3': 204 case '4': 205 case '5': 206 case '6': 207 case '7': 208 case '8': 209 case '9': 210 if (!seen_comma) { 211 low = low * 10 + *c - '0'; 212 if (low > 255) { 213 FAIL("lower bound too big"); 214 } 215 } else { 216 seen_high = true; 217 high = high * 10 + *c - '0'; 218 if (high > 255) { 219 FAIL("upper bound too big"); 220 } 221 } 222 ++c; 223 break; 224 case ',': 225 if (seen_comma) { 226 FAIL("multiple commas"); 227 } 228 seen_comma = true; 229 ++c; 230 break; 231 default: 232 case '{': 233 FAIL("non digit/comma"); 234 case '}': 235 if (seen_high && low > high) { 236 FAIL("bad parse bound"); 237 } 238 seen_comma = false; 239 state = none; 240 ++c; 241 break; 242 } 243 break; 244 case parse_bracket: 245 switch (*c) { 246 case '^': 247 if (seen_char || neg) { 248 goto inside; 249 } 250 neg = true; 251 ++c; 252 break; 253 case '-': 254 if (range == 2) { 255 goto inside; 256 } 257 if (!seen_char) { 258 goto inside; 259 } 260 if (range == 1) { 261 FAIL("bad range"); 262 } 263 range = 2; 264 ++c; 265 break; 266 case '[': 267 ++c; 268 switch (*c) { 269 case '.': /* collating element */ 270 if (range != 0) { 271 --range; 272 } 273 ++c; 274 state = parse_ce; 275 seen_ce = false; 276 break; 277 case '=': /* equivalence class */ 278 if (range == 2) { 279 FAIL("equivalence class in " 280 "range"); 281 } 282 ++c; 283 state = parse_ec; 284 seen_ec = false; 285 break; 286 case ':': /* character class */ 287 if (range == 2) { 288 FAIL("character class in " 289 "range"); 290 } 291 ccname = c; 292 ++c; 293 state = parse_cc; 294 break; 295 } 296 seen_char = true; 297 break; 298 case ']': 299 if (!c[1] && !seen_char) { 300 FAIL("unfinished brace"); 301 } 302 if (!seen_char) { 303 goto inside; 304 } 305 ++c; 306 range = 0; 307 have_atom = true; 308 state = none; 309 break; 310 default: 311 inside: 312 seen_char = true; 313 if (range == 2 && (*c & 0xff) < range_start) { 314 FAIL("out of order range"); 315 } 316 if (range != 0) { 317 --range; 318 } 319 range_start = *c & 0xff; 320 ++c; 321 break; 322 } 323 break; 324 case parse_ce: 325 switch (*c) { 326 case '.': 327 ++c; 328 switch (*c) { 329 case ']': 330 if (!seen_ce) { 331 FAIL("empty ce"); 332 } 333 ++c; 334 state = parse_bracket; 335 break; 336 default: 337 if (seen_ce) { 338 range_start = 256; 339 } else { 340 range_start = '.'; 341 } 342 seen_ce = true; 343 break; 344 } 345 break; 346 default: 347 if (seen_ce) { 348 range_start = 256; 349 } else { 350 range_start = *c; 351 } 352 seen_ce = true; 353 ++c; 354 break; 355 } 356 break; 357 case parse_ec: 358 switch (*c) { 359 case '=': 360 ++c; 361 switch (*c) { 362 case ']': 363 if (!seen_ec) { 364 FAIL("no ec"); 365 } 366 ++c; 367 state = parse_bracket; 368 break; 369 default: 370 seen_ec = true; 371 break; 372 } 373 break; 374 default: 375 seen_ec = true; 376 ++c; 377 break; 378 } 379 break; 380 case parse_cc: 381 switch (*c) { 382 case ':': 383 ++c; 384 switch (*c) { 385 case ']': { 386 unsigned int i; 387 bool found = false; 388 for (i = 0; 389 i < sizeof(cc) / sizeof(*cc); i++) 390 { 391 unsigned int len; 392 len = strlen(cc[i]); 393 if (len != 394 (unsigned int)(c - ccname)) 395 { 396 continue; 397 } 398 if (strncmp(cc[i], ccname, len)) 399 { 400 continue; 401 } 402 found = true; 403 } 404 if (!found) { 405 FAIL("unknown cc"); 406 } 407 ++c; 408 state = parse_bracket; 409 break; 410 } 411 default: 412 break; 413 } 414 break; 415 default: 416 ++c; 417 break; 418 } 419 break; 420 } 421 } 422 if (group != 0) { 423 FAIL("group open"); 424 } 425 if (state != none) { 426 FAIL("incomplete"); 427 } 428 if (!have_atom) { 429 FAIL("no atom"); 430 } 431 return (sub); 432 433 error: 434 #if VALREGEX_REPORT_REASON 435 fprintf(stderr, "%s\n", reason); 436 #endif /* if VALREGEX_REPORT_REASON */ 437 return (-1); 438 } 439