xref: /minix3/external/bsd/bind/dist/lib/isc/regex.c (revision 00b67f09dd46474d133c95011a48590a8e8f94c7)
1 /*	$NetBSD: regex.c,v 1.3 2014/12/10 04:37:59 christos Exp $	*/
2 
3 /*
4  * Copyright (C) 2013, 2014  Internet Systems Consortium, Inc. ("ISC")
5  *
6  * Permission to use, copy, modify, and/or distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
11  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
12  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
13  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
14  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
15  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16  * PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <config.h>
20 
21 #include <isc/file.h>
22 #include <isc/regex.h>
23 #include <isc/string.h>
24 
/*
 * FAIL(x) abandons the current validation by jumping to the 'error'
 * label of the function it is used in.  When VALREGEX_REPORT_REASON is
 * non-zero the message 'x' is first stored in that function's local
 * variable 'reason'; otherwise 'x' is discarded without being evaluated.
 */
#if VALREGEX_REPORT_REASON
#define VALREGEX_NOTE(x) (reason = (x))
#else
#define VALREGEX_NOTE(x) ((void)0)
#endif
#define FAIL(x) do { VALREGEX_NOTE(x); goto error; } while(/*CONSTCOND*/0)
30 
31 /*
32  * Validate the regular expression 'C' locale.
33  */
34 int
isc_regex_validate(const char * c)35 isc_regex_validate(const char *c) {
36 	enum {
37 		none, parse_bracket, parse_bound,
38 		parse_ce, parse_ec, parse_cc
39 	} state = none;
40 	/* Well known character classes. */
41 	const char *cc[] = {
42 		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
43 		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
44 		":print:", ":xdigit:"
45 	};
46 	isc_boolean_t seen_comma = ISC_FALSE;
47 	isc_boolean_t seen_high = ISC_FALSE;
48 	isc_boolean_t seen_char = ISC_FALSE;
49 	isc_boolean_t seen_ec = ISC_FALSE;
50 	isc_boolean_t seen_ce = ISC_FALSE;
51 	isc_boolean_t have_atom = ISC_FALSE;
52 	int group = 0;
53 	int range = 0;
54 	int sub = 0;
55 	isc_boolean_t empty_ok = ISC_FALSE;
56 	isc_boolean_t neg = ISC_FALSE;
57 	isc_boolean_t was_multiple = ISC_FALSE;
58 	unsigned int low = 0;
59 	unsigned int high = 0;
60 	const char *ccname = NULL;
61 	int range_start = 0;
62 #if VALREGEX_REPORT_REASON
63 	const char *reason = "";
64 #endif
65 
66 	if (c == NULL || *c == 0)
67 		FAIL("empty string");
68 
69 	while (c != NULL && *c != 0) {
70 		switch (state) {
71 		case none:
72 			switch (*c) {
73 			case '\\':	/* make literal */
74 				++c;
75 				switch (*c) {
76 				case '1': case '2': case '3':
77 				case '4': case '5': case '6':
78 				case '7': case '8': case '9':
79 					if ((*c - '0') > sub)
80 						FAIL("bad back reference");
81 					have_atom = ISC_TRUE;
82 					was_multiple = ISC_FALSE;
83 					break;
84 				case 0:
85 					FAIL("escaped end-of-string");
86 				default:
87 					goto literal;
88 				}
89 				++c;
90 				break;
91 			case '[':	/* bracket start */
92 				++c;
93 				neg = ISC_FALSE;
94 				was_multiple = ISC_FALSE;
95 				seen_char = ISC_FALSE;
96 				state = parse_bracket;
97 				break;
98 			case '{': 	/* bound start */
99 				switch (c[1]) {
100 				case '0': case '1': case '2': case '3':
101 				case '4': case '5': case '6': case '7':
102 				case '8': case '9':
103 					if (!have_atom)
104 						FAIL("no atom");
105 					if (was_multiple)
106 						FAIL("was multiple");
107 					seen_comma = ISC_FALSE;
108 					seen_high = ISC_FALSE;
109 					low = high = 0;
110 					state = parse_bound;
111 					break;
112 				default:
113 					goto literal;
114 				}
115 				++c;
116 				have_atom = ISC_TRUE;
117 				was_multiple = ISC_TRUE;
118 				break;
119 			case '}':
120 				goto literal;
121 			case '(':	/* group start */
122 				have_atom = ISC_FALSE;
123 				was_multiple = ISC_FALSE;
124 				empty_ok = ISC_TRUE;
125 				++group;
126 				++sub;
127 				++c;
128 				break;
129 			case ')':	/* group end */
130 				if (group && !have_atom && !empty_ok)
131 					FAIL("empty alternative");
132 				have_atom = ISC_TRUE;
133 				was_multiple = ISC_FALSE;
134 				if (group != 0)
135 					--group;
136 				++c;
137 				break;
138 			case '|':	/* alternative seperator */
139 				if (!have_atom)
140 					FAIL("no atom");
141 				have_atom = ISC_FALSE;
142 				empty_ok = ISC_FALSE;
143 				was_multiple = ISC_FALSE;
144 				++c;
145 				break;
146 			case '^':
147 			case '$':
148 				have_atom = ISC_TRUE;
149 				was_multiple = ISC_TRUE;
150 				++c;
151 				break;
152 			case '+':
153 			case '*':
154 			case '?':
155 				if (was_multiple)
156 					FAIL("was multiple");
157 				if (!have_atom)
158 					FAIL("no atom");
159 				have_atom = ISC_TRUE;
160 				was_multiple = ISC_TRUE;
161 				++c;
162 				break;
163 			case '.':
164 			default:
165 			literal:
166 				have_atom = ISC_TRUE;
167 				was_multiple = ISC_FALSE;
168 				++c;
169 				break;
170 			}
171 			break;
172 		case parse_bound:
173 			switch (*c) {
174 			case '0': case '1': case '2': case '3': case '4':
175 			case '5': case '6': case '7': case '8': case '9':
176 				if (!seen_comma) {
177 					low = low * 10 + *c - '0';
178 					if (low > 255)
179 						FAIL("lower bound too big");
180 				} else {
181 					seen_high = ISC_TRUE;
182 					high = high * 10 + *c - '0';
183 					if (high > 255)
184 						FAIL("upper bound too big");
185 				}
186 				++c;
187 				break;
188 			case ',':
189 				if (seen_comma)
190 					FAIL("multiple commas");
191 				seen_comma = ISC_TRUE;
192 				++c;
193 				break;
194 			default:
195 			case '{':
196 				FAIL("non digit/comma");
197 			case '}':
198 				if (seen_high && low > high)
199 					FAIL("bad parse bound");
200 				seen_comma = ISC_FALSE;
201 				state = none;
202 				++c;
203 				break;
204 			}
205 			break;
206 		case parse_bracket:
207 			switch (*c) {
208 			case '^':
209 				if (seen_char || neg) goto inside;
210 				neg = ISC_TRUE;
211 				++c;
212 				break;
213 			case '-':
214 				if (range == 2) goto inside;
215 				if (!seen_char) goto inside;
216 				if (range == 1)
217 					FAIL("bad range");
218 				range = 2;
219 				++c;
220 				break;
221 			case '[':
222 				++c;
223 				switch (*c) {
224 				case '.':	/* collating element */
225 					if (range != 0) --range;
226 					++c;
227 					state = parse_ce;
228 					seen_ce = ISC_FALSE;
229 					break;
230 				case '=':	/* equivalence class */
231 					if (range == 2)
232 					    FAIL("equivalence class in range");
233 					++c;
234 					state = parse_ec;
235 					seen_ec = ISC_FALSE;
236 					break;
237 				case ':':	/* character class */
238 					if (range == 2)
239 					      FAIL("character class in range");
240 					ccname = c;
241 					++c;
242 					state = parse_cc;
243 					break;
244 				}
245 				seen_char = ISC_TRUE;
246 				break;
247 			case ']':
248 				if (!c[1] && !seen_char)
249 					FAIL("unfinished brace");
250 				if (!seen_char)
251 					goto inside;
252 				++c;
253 				range = 0;
254 				have_atom = ISC_TRUE;
255 				state = none;
256 				break;
257 			default:
258 			inside:
259 				seen_char = ISC_TRUE;
260 				if (range == 2 && (*c & 0xff) < range_start)
261 					FAIL("out of order range");
262 				if (range != 0)
263 					--range;
264 				range_start = *c & 0xff;
265 				++c;
266 				break;
267 			};
268 			break;
269 		case parse_ce:
270 			switch (*c) {
271 			case '.':
272 				++c;
273 				switch (*c) {
274 				case ']':
275 					if (!seen_ce)
276 						 FAIL("empty ce");
277 					++c;
278 					state = parse_bracket;
279 					break;
280 				default:
281 					if (seen_ce)
282 						range_start = 256;
283 					else
284 						range_start = '.';
285 					seen_ce = ISC_TRUE;
286 					break;
287 				}
288 				break;
289 			default:
290 				if (seen_ce)
291 					range_start = 256;
292 				else
293 					range_start = *c;
294 				seen_ce = ISC_TRUE;
295 				++c;
296 				break;
297 			}
298 			break;
299 		case parse_ec:
300 			switch (*c) {
301 			case '=':
302 				++c;
303 				switch (*c) {
304 				case ']':
305 					if (!seen_ec)
306 						FAIL("no ec");
307 					++c;
308 					state = parse_bracket;
309 					break;
310 				default:
311 					seen_ec = ISC_TRUE;
312 					break;
313 				}
314 				break;
315 			default:
316 				seen_ec = ISC_TRUE;
317 				++c;
318 				break;
319 			}
320 			break;
321 		case parse_cc:
322 			switch (*c) {
323 			case ':':
324 				++c;
325 				switch (*c) {
326 				case ']': {
327 					unsigned int i;
328 					isc_boolean_t found = ISC_FALSE;
329 					for (i = 0;
330 					     i < sizeof(cc)/sizeof(*cc);
331 					     i++)
332 					{
333 						unsigned int len;
334 						len = strlen(cc[i]);
335 						if (len !=
336 						    (unsigned int)(c - ccname))
337 							continue;
338 						if (strncmp(cc[i], ccname, len))
339 							continue;
340 						found = ISC_TRUE;
341 					}
342 					if (!found)
343 						FAIL("unknown cc");
344 					++c;
345 					state = parse_bracket;
346 					break;
347 					}
348 				default:
349 					break;
350 				}
351 				break;
352 			default:
353 				++c;
354 				break;
355 			}
356 			break;
357 		}
358 	}
359 	if (group != 0)
360 		FAIL("group open");
361 	if (state != none)
362 		FAIL("incomplete");
363 	if (!have_atom)
364 		FAIL("no atom");
365 	return (sub);
366 
367  error:
368 #if VALREGEX_REPORT_REASON
369 	fprintf(stderr, "%s\n", reason);
370 #endif
371 	return (-1);
372 }
373