1 /* $NetBSD: regex.c,v 1.1 2024/02/18 20:57:50 christos Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16 #include <stdbool.h>
17
18 #include <isc/file.h>
19 #include <isc/print.h>
20 #include <isc/regex.h>
21 #include <isc/string.h>
22
/*
 * FAIL(x) abandons validation by jumping to the 'error' label of the
 * enclosing function.  When VALREGEX_REPORT_REASON is non-zero the message
 * 'x' is first stored in the local variable 'reason', which the error path
 * prints to stderr.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x)               \
	do {                  \
		reason = (x); \
		goto error;   \
	} while (0)
#else /* if VALREGEX_REPORT_REASON */
#define FAIL(x) goto error
#endif /* if VALREGEX_REPORT_REASON */

/*
 * Validate the syntax of the regular expression 'c', interpreting bracket
 * expressions by raw byte value ('C' locale).
 *
 * The expression is scanned once by a small state machine; nothing is
 * compiled or matched.
 *
 * Returns:
 *	the number of '(' group openers seen (>= 0) if the expression is
 *	accepted;
 *	-1 if 'c' is NULL, empty, or rejected.
 *
 * Notes:
 *	- Repetition bounds ({m}, {m,}, {m,n}) are limited to 255 and must
 *	  satisfy m <= n.
 *	- A back reference \N is accepted only if at least N groups have been
 *	  opened so far.
 *	- A ')' with no matching '(' is accepted and treated as an atom.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,	       /* outside any bracket/bound construct */
		parse_bracket, /* inside [...] */
		parse_bound,   /* inside {...} */
		parse_ce,      /* inside [. .] (collating element) */
		parse_ec,      /* inside [= =] (equivalence class) */
		parse_cc       /* inside [: :] (character class) */
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:",
		":graph:", ":space:", ":blank:", ":lower:",
		":upper:", ":cntrl:", ":print:", ":xdigit:"
	};
	bool seen_comma = false; /* ',' seen in the current bound */
	bool seen_high = false;	 /* upper limit digits seen in the bound */
	bool seen_char = false;	 /* bracket expression has a member */
	bool seen_ec = false;	 /* equivalence class has content */
	bool seen_ce = false;	 /* collating element has content */
	bool have_atom = false;	 /* something a repetition can apply to */
	int group = 0;		 /* currently open '(' groups */
	/*
	 * Range tracking inside a bracket expression:
	 *   0 - not in a range,
	 *   2 - '-' just seen, the end point is expected next,
	 *   1 - the end point was just consumed (a following '-' is an error).
	 */
	int range = 0;
	int sub = 0;	       /* total '(' seen; also the return value */
	bool empty_ok = false; /* an empty alternative is allowed here */
	bool neg = false;      /* bracket expression started with '^' */
	bool was_multiple = false; /* previous item was a repetition/anchor */
	unsigned int low = 0;	   /* lower limit of the current bound */
	unsigned int high = 0;	   /* upper limit of the current bound */
	const char *ccname = NULL; /* leading ':' of the class being parsed */
	/*
	 * Byte value (0..255) of the most recent range start candidate, or
	 * 256 for a multi-character collating element so that any end point
	 * compares as out of order.
	 */
	int range_start = 0;
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif /* if VALREGEX_REPORT_REASON */

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\': /* make literal */
				++c;
				switch (*c) {
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					/* Must refer to an already opened group. */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[': /* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{': /* bound start */
				/* '{' only opens a bound when a digit follows. */
				switch (c[1]) {
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(': /* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')': /* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				/* An unmatched ')' is tolerated. */
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|': /* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/* Anchors cannot be followed by a repetition. */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				if (!seen_comma) {
					low = low * 10 +
					      (unsigned int)(*c - '0');
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 +
					       (unsigned int)(*c - '0');
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			default:
			case '{':
				FAIL("non digit/comma");
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates the set. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/* '-' as a range end point is a plain member. */
				if (range == 2) {
					goto inside;
				}
				/* A leading '-' is a plain member. */
				if (!seen_char) {
					goto inside;
				}
				/* "a-b-c": an end point cannot start a range. */
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				++c;
				switch (*c) {
				case '.': /* collating element */
					/* Counts as a range end point. */
					if (range != 0) {
						--range;
					}
					++c;
					state = parse_ce;
					seen_ce = false;
					break;
				case '=': /* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in "
						     "range");
					}
					++c;
					state = parse_ec;
					seen_ec = false;
					break;
				case ':': /* character class */
					if (range == 2) {
						FAIL("character class in "
						     "range");
					}
					ccname = c;
					++c;
					state = parse_cc;
					break;
				}
				seen_char = true;
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A leading ']' is a plain member. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '.' was part of the element. */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					/*
					 * Mask to an unsigned byte value so
					 * that bytes >= 0x80 are ordered the
					 * same way as plain bracket members,
					 * regardless of the signedness of
					 * 'char'.
					 */
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '=' was part of the class. */
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * 'ccname' points at the leading ':'
					 * and 'c' just past the trailing ':',
					 * so the span is e.g. ":alpha:".
					 */
					size_t namelen = (size_t)(c - ccname);
					size_t i;
					bool found = false;

					for (i = 0;
					     i < sizeof(cc) / sizeof(cc[0]);
					     i++)
					{
						if (strlen(cc[i]) == namelen &&
						    strncmp(cc[i], ccname,
							    namelen) == 0)
						{
							found = true;
							break;
						}
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return (sub);

error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif /* if VALREGEX_REPORT_REASON */
	return (-1);
}
439