lib/isc/regex.c

/*	$NetBSD: regex.c,v 1.1 2024/02/18 20:57:50 christos Exp $	*/

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

#include <stdbool.h>

#include <isc/file.h>
#include <isc/print.h>
#include <isc/regex.h>
#include <isc/string.h>

#if VALREGEX_REPORT_REASON
#define FAIL(x)               \
	do {                  \
		reason = (x); \
		goto error;   \
	} while (0)
#else /* if VALREGEX_REPORT_REASON */
#define FAIL(x) goto error
#endif /* if VALREGEX_REPORT_REASON */

/*
 * Validate the regular expression 'C' locale.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,
		parse_bracket,
		parse_bound,
		parse_ce,
		parse_ec,
		parse_cc
	} state = none;
	/* Well known character classes. */
	const char *cc[] = { ":alnum:", ":digit:", ":punct:", ":alpha:",
			     ":graph:", ":space:", ":blank:", ":lower:",
			     ":upper:", ":cntrl:", ":print:", ":xdigit:" };
	bool seen_comma = false;
	bool seen_high = false;
	bool seen_char = false;
	bool seen_ec = false;
	bool seen_ce = false;
	bool have_atom = false;
	int group = 0;
	int range = 0;
	int sub = 0;
	bool empty_ok = false;
	bool neg = false;
	bool was_multiple = false;
	unsigned int low = 0;
	unsigned int high = 0;
	const char *ccname = NULL;
	int range_start = 0;
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif /* if VALREGEX_REPORT_REASON */

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (c != NULL && *c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\': /* make literal */
				++c;
				switch (*c) {
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[': /* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{': /* bound start */
				switch (c[1]) {
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(': /* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')': /* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|': /* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 + *c - '0';
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			default:
			case '{':
				FAIL("non digit/comma");
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.': /* collating element */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=': /* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in "
						     "range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':': /* character class */
					if (range == 2) {
						FAIL("character class in "
						     "range");
					}
					ccname = c;
					++c;
					state = parse_cc;
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					range_start = *c;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					unsigned int i;
					bool found = false;
					for (i = 0;
					     i < sizeof(cc) / sizeof(*cc); i++)
					{
						unsigned int len;
						len = strlen(cc[i]);
						if (len !=
						    (unsigned int)(c - ccname))
						{
							continue;
						}
						if (strncmp(cc[i], ccname, len))
						{
							continue;
						}
						found = true;
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return (sub);

error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif /* if VALREGEX_REPORT_REASON */
	return (-1);
}