1 /* $NetBSD: regex.c,v 1.3 2014/12/10 04:37:59 christos Exp $ */ 2 3 /* 4 * Copyright (C) 2013, 2014 Internet Systems Consortium, Inc. ("ISC") 5 * 6 * Permission to use, copy, modify, and/or distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 11 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 12 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 13 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 14 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 15 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 16 * PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <config.h> 20 21 #include <isc/file.h> 22 #include <isc/regex.h> 23 #include <isc/string.h> 24 25 #if VALREGEX_REPORT_REASON 26 #define FAIL(x) do { reason = (x); goto error; } while(/*CONSTCOND*/0) 27 #else 28 #define FAIL(x) goto error 29 #endif 30 31 /* 32 * Validate the regular expression 'C' locale. 33 */ 34 int 35 isc_regex_validate(const char *c) { 36 enum { 37 none, parse_bracket, parse_bound, 38 parse_ce, parse_ec, parse_cc 39 } state = none; 40 /* Well known character classes. */ 41 const char *cc[] = { 42 ":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:", 43 ":space:", ":blank:", ":lower:", ":upper:", ":cntrl:", 44 ":print:", ":xdigit:" 45 }; 46 isc_boolean_t seen_comma = ISC_FALSE; 47 isc_boolean_t seen_high = ISC_FALSE; 48 isc_boolean_t seen_char = ISC_FALSE; 49 isc_boolean_t seen_ec = ISC_FALSE; 50 isc_boolean_t seen_ce = ISC_FALSE; 51 isc_boolean_t have_atom = ISC_FALSE; 52 int group = 0; 53 int range = 0; 54 int sub = 0; 55 isc_boolean_t empty_ok = ISC_FALSE; 56 isc_boolean_t neg = ISC_FALSE; 57 isc_boolean_t was_multiple = ISC_FALSE; 58 unsigned int low = 0; 59 unsigned int high = 0; 60 const char *ccname = NULL; 61 int range_start = 0; 62 #if VALREGEX_REPORT_REASON 63 const char *reason = ""; 64 #endif 65 66 if (c == NULL || *c == 0) 67 FAIL("empty string"); 68 69 while (c != NULL && *c != 0) { 70 switch (state) { 71 case none: 72 switch (*c) { 73 case '\\': /* make literal */ 74 ++c; 75 switch (*c) { 76 case '1': case '2': case '3': 77 case '4': case '5': case '6': 78 case '7': case '8': case '9': 79 if ((*c - '0') > sub) 80 FAIL("bad back reference"); 81 have_atom = ISC_TRUE; 82 was_multiple = ISC_FALSE; 83 break; 84 case 0: 85 FAIL("escaped end-of-string"); 86 default: 87 goto literal; 88 } 89 ++c; 90 break; 91 case '[': /* bracket start */ 92 ++c; 93 neg = ISC_FALSE; 94 was_multiple = ISC_FALSE; 95 seen_char = ISC_FALSE; 96 state = parse_bracket; 97 break; 98 case '{': /* bound start */ 99 switch (c[1]) { 100 case '0': case '1': case '2': case '3': 101 case '4': case '5': case '6': case '7': 102 case '8': case '9': 103 if (!have_atom) 104 FAIL("no atom"); 105 if (was_multiple) 106 FAIL("was multiple"); 107 seen_comma = ISC_FALSE; 108 seen_high = ISC_FALSE; 109 low = high = 0; 110 state = parse_bound; 111 break; 112 default: 113 goto literal; 114 } 115 ++c; 116 have_atom = ISC_TRUE; 117 was_multiple = ISC_TRUE; 118 break; 119 case '}': 120 goto literal; 121 case '(': /* group start */ 122 have_atom = ISC_FALSE; 123 was_multiple = ISC_FALSE; 124 empty_ok = ISC_TRUE; 125 ++group; 126 ++sub; 127 ++c; 128 break; 129 case ')': /* group end */ 130 if (group && !have_atom && !empty_ok) 131 FAIL("empty alternative"); 132 have_atom = ISC_TRUE; 133 was_multiple = ISC_FALSE; 134 if (group != 0) 135 --group; 136 ++c; 137 break; 138 case '|': /* alternative seperator */ 139 if (!have_atom) 140 FAIL("no atom"); 141 have_atom = ISC_FALSE; 142 empty_ok = ISC_FALSE; 143 was_multiple = ISC_FALSE; 144 ++c; 145 break; 146 case '^': 147 case '$': 148 have_atom = ISC_TRUE; 149 was_multiple = ISC_TRUE; 150 ++c; 151 break; 152 case '+': 153 case '*': 154 case '?': 155 if (was_multiple) 156 FAIL("was multiple"); 157 if (!have_atom) 158 FAIL("no atom"); 159 have_atom = ISC_TRUE; 160 was_multiple = ISC_TRUE; 161 ++c; 162 break; 163 case '.': 164 default: 165 literal: 166 have_atom = ISC_TRUE; 167 was_multiple = ISC_FALSE; 168 ++c; 169 break; 170 } 171 break; 172 case parse_bound: 173 switch (*c) { 174 case '0': case '1': case '2': case '3': case '4': 175 case '5': case '6': case '7': case '8': case '9': 176 if (!seen_comma) { 177 low = low * 10 + *c - '0'; 178 if (low > 255) 179 FAIL("lower bound too big"); 180 } else { 181 seen_high = ISC_TRUE; 182 high = high * 10 + *c - '0'; 183 if (high > 255) 184 FAIL("upper bound too big"); 185 } 186 ++c; 187 break; 188 case ',': 189 if (seen_comma) 190 FAIL("multiple commas"); 191 seen_comma = ISC_TRUE; 192 ++c; 193 break; 194 default: 195 case '{': 196 FAIL("non digit/comma"); 197 case '}': 198 if (seen_high && low > high) 199 FAIL("bad parse bound"); 200 seen_comma = ISC_FALSE; 201 state = none; 202 ++c; 203 break; 204 } 205 break; 206 case parse_bracket: 207 switch (*c) { 208 case '^': 209 if (seen_char || neg) goto inside; 210 neg = ISC_TRUE; 211 ++c; 212 break; 213 case '-': 214 if (range == 2) goto inside; 215 if (!seen_char) goto inside; 216 if (range == 1) 217 FAIL("bad range"); 218 range = 2; 219 ++c; 220 break; 221 case '[': 222 ++c; 223 switch (*c) { 224 case '.': /* collating element */ 225 if (range != 0) --range; 226 ++c; 227 state = parse_ce; 228 seen_ce = ISC_FALSE; 229 break; 230 case '=': /* equivalence class */ 231 if (range == 2) 232 FAIL("equivalence class in range"); 233 ++c; 234 state = parse_ec; 235 seen_ec = ISC_FALSE; 236 break; 237 case ':': /* character class */ 238 if (range == 2) 239 FAIL("character class in range"); 240 ccname = c; 241 ++c; 242 state = parse_cc; 243 break; 244 } 245 seen_char = ISC_TRUE; 246 break; 247 case ']': 248 if (!c[1] && !seen_char) 249 FAIL("unfinished brace"); 250 if (!seen_char) 251 goto inside; 252 ++c; 253 range = 0; 254 have_atom = ISC_TRUE; 255 state = none; 256 break; 257 default: 258 inside: 259 seen_char = ISC_TRUE; 260 if (range == 2 && (*c & 0xff) < range_start) 261 FAIL("out of order range"); 262 if (range != 0) 263 --range; 264 range_start = *c & 0xff; 265 ++c; 266 break; 267 }; 268 break; 269 case parse_ce: 270 switch (*c) { 271 case '.': 272 ++c; 273 switch (*c) { 274 case ']': 275 if (!seen_ce) 276 FAIL("empty ce"); 277 ++c; 278 state = parse_bracket; 279 break; 280 default: 281 if (seen_ce) 282 range_start = 256; 283 else 284 range_start = '.'; 285 seen_ce = ISC_TRUE; 286 break; 287 } 288 break; 289 default: 290 if (seen_ce) 291 range_start = 256; 292 else 293 range_start = *c; 294 seen_ce = ISC_TRUE; 295 ++c; 296 break; 297 } 298 break; 299 case parse_ec: 300 switch (*c) { 301 case '=': 302 ++c; 303 switch (*c) { 304 case ']': 305 if (!seen_ec) 306 FAIL("no ec"); 307 ++c; 308 state = parse_bracket; 309 break; 310 default: 311 seen_ec = ISC_TRUE; 312 break; 313 } 314 break; 315 default: 316 seen_ec = ISC_TRUE; 317 ++c; 318 break; 319 } 320 break; 321 case parse_cc: 322 switch (*c) { 323 case ':': 324 ++c; 325 switch (*c) { 326 case ']': { 327 unsigned int i; 328 isc_boolean_t found = ISC_FALSE; 329 for (i = 0; 330 i < sizeof(cc)/sizeof(*cc); 331 i++) 332 { 333 unsigned int len; 334 len = strlen(cc[i]); 335 if (len != 336 (unsigned int)(c - ccname)) 337 continue; 338 if (strncmp(cc[i], ccname, len)) 339 continue; 340 found = ISC_TRUE; 341 } 342 if (!found) 343 FAIL("unknown cc"); 344 ++c; 345 state = parse_bracket; 346 break; 347 } 348 default: 349 break; 350 } 351 break; 352 default: 353 ++c; 354 break; 355 } 356 break; 357 } 358 } 359 if (group != 0) 360 FAIL("group open"); 361 if (state != none) 362 FAIL("incomplete"); 363 if (!have_atom) 364 FAIL("no atom"); 365 return (sub); 366 367 error: 368 #if VALREGEX_REPORT_REASON 369 fprintf(stderr, "%s\n", reason); 370 #endif 371 return (-1); 372 } 373