1 /* $NetBSD: regex.c,v 1.8 2025/01/26 16:25:38 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 #include <stdbool.h> 17 18 #include <isc/file.h> 19 #include <isc/regex.h> 20 #include <isc/string.h> 21 22 #if VALREGEX_REPORT_REASON 23 #define FAIL(x) \ 24 do { \ 25 reason = (x); \ 26 goto error; \ 27 } while (0) 28 #else /* if VALREGEX_REPORT_REASON */ 29 #define FAIL(x) goto error 30 #endif /* if VALREGEX_REPORT_REASON */ 31 32 /* 33 * Validate the regular expression 'C' locale. 34 */ 35 int 36 isc_regex_validate(const char *c) { 37 enum { 38 none, 39 parse_bracket, 40 parse_bound, 41 parse_ce, 42 parse_ec, 43 parse_cc 44 } state = none; 45 /* Well known character classes. */ 46 const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:", 47 ":graph:", ":space:", ":blank:", ":lower:", 48 ":upper:", ":cntrl:", ":print:", ":xdigit:" }; 49 bool seen_comma = false; 50 bool seen_high = false; 51 bool seen_char = false; 52 bool seen_ec = false; 53 bool seen_ce = false; 54 bool have_atom = false; 55 int group = 0; 56 int range = 0; 57 int sub = 0; 58 bool empty_ok = false; 59 bool neg = false; 60 bool was_multiple = false; 61 unsigned int low = 0; 62 unsigned int high = 0; 63 const char *ccname = NULL; 64 int range_start = 0; 65 #if VALREGEX_REPORT_REASON 66 const char *reason = ""; 67 #endif /* if VALREGEX_REPORT_REASON */ 68 69 if (c == NULL || *c == 0) { 70 FAIL("empty string"); 71 } 72 73 while (c != NULL && *c != 0) { 74 switch (state) { 75 case none: 76 switch (*c) { 77 case '\\': /* make literal */ 78 ++c; 79 switch (*c) { 80 case '1': 81 case '2': 82 case '3': 83 case '4': 84 case '5': 85 case '6': 86 case '7': 87 case '8': 88 case '9': 89 if ((*c - '0') > sub) { 90 FAIL("bad back reference"); 91 } 92 have_atom = true; 93 was_multiple = false; 94 break; 95 case 0: 96 FAIL("escaped end-of-string"); 97 default: 98 goto literal; 99 } 100 ++c; 101 break; 102 case '[': /* bracket start */ 103 ++c; 104 neg = false; 105 was_multiple = false; 106 seen_char = false; 107 state = parse_bracket; 108 break; 109 case '{': /* bound start */ 110 switch (c[1]) { 111 case '0': 112 case '1': 113 case '2': 114 case '3': 115 case '4': 116 case '5': 117 case '6': 118 case '7': 119 case '8': 120 case '9': 121 if (!have_atom) { 122 FAIL("no atom"); 123 } 124 if (was_multiple) { 125 FAIL("was multiple"); 126 } 127 seen_comma = false; 128 seen_high = false; 129 low = high = 0; 130 state = parse_bound; 131 break; 132 default: 133 goto literal; 134 } 135 ++c; 136 have_atom = true; 137 was_multiple = true; 138 break; 139 case '}': 140 goto literal; 141 case '(': /* group start */ 142 have_atom = false; 143 was_multiple = false; 144 empty_ok = true; 145 ++group; 146 ++sub; 147 ++c; 148 break; 149 case ')': /* group end */ 150 if (group && !have_atom && !empty_ok) { 151 FAIL("empty alternative"); 152 } 153 have_atom = true; 154 was_multiple = false; 155 if (group != 0) { 156 --group; 157 } 158 ++c; 159 break; 160 case '|': /* alternative separator */ 161 if (!have_atom) { 162 FAIL("no atom"); 163 } 164 have_atom = false; 165 empty_ok = false; 166 was_multiple = false; 167 ++c; 168 break; 169 case '^': 170 case '$': 171 have_atom = true; 172 was_multiple = true; 173 ++c; 174 break; 175 case '+': 176 case '*': 177 case '?': 178 if (was_multiple) { 179 FAIL("was multiple"); 180 } 181 if (!have_atom) { 182 FAIL("no atom"); 183 } 184 have_atom = true; 185 was_multiple = true; 186 ++c; 187 break; 188 case '.': 189 default: 190 literal: 191 have_atom = true; 192 was_multiple = false; 193 ++c; 194 break; 195 } 196 break; 197 case parse_bound: 198 switch (*c) { 199 case '0': 200 case '1': 201 case '2': 202 case '3': 203 case '4': 204 case '5': 205 case '6': 206 case '7': 207 case '8': 208 case '9': 209 if (!seen_comma) { 210 low = low * 10 + *c - '0'; 211 if (low > 255) { 212 FAIL("lower bound too big"); 213 } 214 } else { 215 seen_high = true; 216 high = high * 10 + *c - '0'; 217 if (high > 255) { 218 FAIL("upper bound too big"); 219 } 220 } 221 ++c; 222 break; 223 case ',': 224 if (seen_comma) { 225 FAIL("multiple commas"); 226 } 227 seen_comma = true; 228 ++c; 229 break; 230 default: 231 case '{': 232 FAIL("non digit/comma"); 233 case '}': 234 if (seen_high && low > high) { 235 FAIL("bad parse bound"); 236 } 237 seen_comma = false; 238 state = none; 239 ++c; 240 break; 241 } 242 break; 243 case parse_bracket: 244 switch (*c) { 245 case '^': 246 if (seen_char || neg) { 247 goto inside; 248 } 249 neg = true; 250 ++c; 251 break; 252 case '-': 253 if (range == 2) { 254 goto inside; 255 } 256 if (!seen_char) { 257 goto inside; 258 } 259 if (range == 1) { 260 FAIL("bad range"); 261 } 262 range = 2; 263 ++c; 264 break; 265 case '[': 266 ++c; 267 switch (*c) { 268 case '.': /* collating element */ 269 if (range != 0) { 270 --range; 271 } 272 ++c; 273 state = parse_ce; 274 seen_ce = false; 275 break; 276 case '=': /* equivalence class */ 277 if (range == 2) { 278 FAIL("equivalence class in " 279 "range"); 280 } 281 ++c; 282 state = parse_ec; 283 seen_ec = false; 284 break; 285 case ':': /* character class */ 286 if (range == 2) { 287 FAIL("character class in " 288 "range"); 289 } 290 ccname = c; 291 ++c; 292 state = parse_cc; 293 break; 294 } 295 seen_char = true; 296 break; 297 case ']': 298 if (!c[1] && !seen_char) { 299 FAIL("unfinished brace"); 300 } 301 if (!seen_char) { 302 goto inside; 303 } 304 ++c; 305 range = 0; 306 have_atom = true; 307 state = none; 308 break; 309 default: 310 inside: 311 seen_char = true; 312 if (range == 2 && (*c & 0xff) < range_start) { 313 FAIL("out of order range"); 314 } 315 if (range != 0) { 316 --range; 317 } 318 range_start = *c & 0xff; 319 ++c; 320 break; 321 } 322 break; 323 case parse_ce: 324 switch (*c) { 325 case '.': 326 ++c; 327 switch (*c) { 328 case ']': 329 if (!seen_ce) { 330 FAIL("empty ce"); 331 } 332 ++c; 333 state = parse_bracket; 334 break; 335 default: 336 if (seen_ce) { 337 range_start = 256; 338 } else { 339 range_start = '.'; 340 } 341 seen_ce = true; 342 break; 343 } 344 break; 345 default: 346 if (seen_ce) { 347 range_start = 256; 348 } else { 349 range_start = *c; 350 } 351 seen_ce = true; 352 ++c; 353 break; 354 } 355 break; 356 case parse_ec: 357 switch (*c) { 358 case '=': 359 ++c; 360 switch (*c) { 361 case ']': 362 if (!seen_ec) { 363 FAIL("no ec"); 364 } 365 ++c; 366 state = parse_bracket; 367 break; 368 default: 369 seen_ec = true; 370 break; 371 } 372 break; 373 default: 374 seen_ec = true; 375 ++c; 376 break; 377 } 378 break; 379 case parse_cc: 380 switch (*c) { 381 case ':': 382 ++c; 383 switch (*c) { 384 case ']': { 385 unsigned int i; 386 bool found = false; 387 for (i = 0; 388 i < sizeof(cc) / sizeof(*cc); i++) 389 { 390 unsigned int len; 391 len = strlen(cc[i]); 392 if (len != 393 (unsigned int)(c - ccname)) 394 { 395 continue; 396 } 397 if (strncmp(cc[i], ccname, len)) 398 { 399 continue; 400 } 401 found = true; 402 } 403 if (!found) { 404 FAIL("unknown cc"); 405 } 406 ++c; 407 state = parse_bracket; 408 break; 409 } 410 default: 411 break; 412 } 413 break; 414 default: 415 ++c; 416 break; 417 } 418 break; 419 } 420 } 421 if (group != 0) { 422 FAIL("group open"); 423 } 424 if (state != none) { 425 FAIL("incomplete"); 426 } 427 if (!have_atom) { 428 FAIL("no atom"); 429 } 430 return sub; 431 432 error: 433 #if VALREGEX_REPORT_REASON 434 fprintf(stderr, "%s\n", reason); 435 #endif /* if VALREGEX_REPORT_REASON */ 436 return -1; 437 } 438