xref: /openbsd-src/usr.bin/dig/lib/isc/regex.c (revision 479c151d3429b7cfa6228ee428d945620629789d)
15185a700Sflorian /*
25185a700Sflorian  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
35185a700Sflorian  *
45185a700Sflorian  * Permission to use, copy, modify, and/or distribute this software for any
55185a700Sflorian  * purpose with or without fee is hereby granted, provided that the above
65185a700Sflorian  * copyright notice and this permission notice appear in all copies.
75185a700Sflorian  *
85185a700Sflorian  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
95185a700Sflorian  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
105185a700Sflorian  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
115185a700Sflorian  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
125185a700Sflorian  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
135185a700Sflorian  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
145185a700Sflorian  * PERFORMANCE OF THIS SOFTWARE.
155185a700Sflorian  */
165185a700Sflorian 
175185a700Sflorian #include <isc/regex.h>
18c6d1a7a6Sjsg #include <isc/types.h>
195185a700Sflorian #include <string.h>
205185a700Sflorian 
/*
 * Validate the syntax of an extended regular expression, treating every
 * byte as a single character (i.e. as in the 'C' locale).
 *
 * The expression is scanned once by a small state machine; nothing is
 * compiled or allocated.
 *
 * 'c' is the NUL-terminated expression to check.
 *
 * Returns -1 if 'c' is NULL, empty, or not well formed; otherwise returns
 * the number of '(' groups seen, which is also the highest back reference
 * number ("\1" .. "\9") the expression is allowed to use.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none, parse_bracket, parse_bound,
		parse_ce, parse_ec, parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
		":print:", ":xdigit:"
	};
	int seen_comma = 0;	/* bound: ',' already seen */
	int seen_high = 0;	/* bound: upper limit has digits */
	int seen_char = 0;	/* bracket: at least one member seen */
	int seen_ec = 0;	/* equivalence class is non-empty */
	int seen_ce = 0;	/* collating element is non-empty */
	int have_atom = 0;	/* something repeatable precedes us */
	int group = 0;		/* currently open '(' groups */
	/*
	 * Bracket range tracking: 2 directly after a '-', 1 directly after
	 * the end point of a range, 0 otherwise.
	 */
	int range = 0;
	int sub = 0;		/* total '(' seen so far */
	int empty_ok = 0;	/* "()" is acceptable for this group */
	int neg = 0;		/* bracket: leading '^' seen */
	int was_multiple = 0;	/* previous item can't take a repetition */
	unsigned int low = 0;
	unsigned int high = 0;
	const char *ccname = NULL;	/* leading ':' of a character class */
	int range_start = 0;	/* last bracket member, 0..255 (or 256) */

	if (c == NULL || *c == 0)
		return(-1);

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\':	/* make literal */
				++c;
				switch (*c) {
				case '1': case '2': case '3':
				case '4': case '5': case '6':
				case '7': case '8': case '9':
					/* Back reference to a group not yet seen. */
					if ((*c - '0') > sub)
						return(-1);
					have_atom = 1;
					was_multiple = 0;
					break;
				case 0:
					/* Trailing backslash. */
					return(-1);
				default:
					goto literal;
				}
				++c;
				break;
			case '[':	/* bracket start */
				++c;
				neg = 0;
				was_multiple = 0;
				seen_char = 0;
				state = parse_bracket;
				break;
			case '{': 	/* bound start */
				switch (c[1]) {
				case '0': case '1': case '2': case '3':
				case '4': case '5': case '6': case '7':
				case '8': case '9':
					if (!have_atom)
						return(-1);
					if (was_multiple)
						return(-1);
					seen_comma = 0;
					seen_high = 0;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					/* '{' not followed by a digit is a literal. */
					goto literal;
				}
				++c;
				have_atom = 1;
				was_multiple = 1;
				break;
			case '}':
				goto literal;
			case '(':	/* group start */
				have_atom = 0;
				was_multiple = 0;
				empty_ok = 1;
				++group;
				++sub;
				++c;
				break;
			case ')':	/* group end */
				if (group && !have_atom && !empty_ok)
					return(-1);
				have_atom = 1;
				was_multiple = 0;
				/* An unmatched ')' is accepted and leaves 'group' at 0. */
				if (group != 0)
					--group;
				++c;
				break;
			case '|':	/* alternative separator */
				if (!have_atom)
					return(-1);
				have_atom = 0;
				empty_ok = 0;
				was_multiple = 0;
				++c;
				break;
			case '^':
			case '$':
				/* Anchors count as an atom but can't be repeated. */
				have_atom = 1;
				was_multiple = 1;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple)
					return(-1);
				if (!have_atom)
					return(-1);
				have_atom = 1;
				was_multiple = 1;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = 1;
				was_multiple = 0;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				/* Both limits are capped at 255. */
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255)
						return(-1);
				} else {
					seen_high = 1;
					high = high * 10 + *c - '0';
					if (high > 255)
						return(-1);
				}
				++c;
				break;
			case ',':
				if (seen_comma)
					return(-1);
				seen_comma = 1;
				++c;
				break;
			default:
			case '{':
				return(-1);
			case '}':
				if (seen_high && low > high)
					return(-1);
				seen_comma = 0;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates; later ones are members. */
				if (seen_char || neg) goto inside;
				neg = 1;
				++c;
				break;
			case '-':
				/* '-' as a range end point or first member is literal. */
				if (range == 2) goto inside;
				if (!seen_char) goto inside;
				/* The end of one range can't start another. */
				if (range == 1)
					return(-1);
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.':	/* collating element */
					if (range != 0) --range;
					++c;
					state = parse_ce;
					seen_ce = 0;
					break;
				case '=':	/* equivalence class */
					if (range == 2)
						return(-1);
					++c;
					state = parse_ec;
					seen_ec = 0;
					break;
				case ':':	/* character class */
					if (range == 2)
						return(-1);
					ccname = c;
					++c;
					state = parse_cc;
					break;
				}
				seen_char = 1;
				break;
			case ']':
				/* A lone leading ']' at end of input never closes. */
				if (!c[1] && !seen_char)
					return(-1);
				/* A leading ']' is an ordinary member. */
				if (!seen_char)
					goto inside;
				++c;
				range = 0;
				have_atom = 1;
				state = none;
				break;
			default:
			inside:
				seen_char = 1;
				/* Range end point must not sort before its start. */
				if (range == 2 &&
				    (int)(unsigned char)*c < range_start)
					return(-1);
				if (range != 0)
					--range;
				range_start = (unsigned char)*c;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce)
						return(-1);
					++c;
					state = parse_bracket;
					break;
				default:
					/*
					 * The '.' was part of the element.  256 is
					 * above every byte value, so a multi-character
					 * element makes any following plain range end
					 * point fail the ordering check.
					 */
					if (seen_ce)
						range_start = 256;
					else
						range_start = '.';
					seen_ce = 1;
					break;
				}
				break;
			default:
				if (seen_ce)
					range_start = 256;
				else {
					/*
					 * Store the byte as 0..255 like every other
					 * range_start assignment; a plain 'char' may
					 * be negative for bytes >= 0x80 and would
					 * defeat the range ordering check.
					 */
					range_start = (unsigned char)*c;
				}
				seen_ce = 1;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec)
						return(-1);
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '=' was part of the class contents. */
					seen_ec = 1;
					break;
				}
				break;
			default:
				seen_ec = 1;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/* ccname..c spans ":name:"; it must be known. */
					size_t i;
					size_t namelen = (size_t)(c - ccname);
					int found = 0;
					for (i = 0;
					     i < sizeof(cc)/sizeof(*cc);
					     i++)
					{
						size_t len = strlen(cc[i]);
						if (len != namelen)
							continue;
						if (strncmp(cc[i], ccname, len))
							continue;
						found = 1;
						break;
					}
					if (!found)
						return(-1);
					++c;
					state = parse_bracket;
					break;
					}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	/* Unclosed group, unterminated bracket/bound, or dangling '|'. */
	if (group != 0)
		return(-1);
	if (state != none)
		return(-1);
	if (!have_atom)
		return(-1);
	return (sub);
}
354