1 /* $NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $ */
2
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2021 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17 /* Copyright 1997, 1998, 1999 Computing Research Labs,
18 * New Mexico State University
19 *
20 * Permission is hereby granted, free of charge, to any person obtaining a
21 * copy of this software and associated documentation files (the "Software"),
22 * to deal in the Software without restriction, including without limitation
23 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 * and/or sell copies of the Software, and to permit persons to whom the
25 * Software is furnished to do so, subject to the following conditions:
26 *
27 * The above copyright notice and this permission notice shall be included in
28 * all copies or substantial portions of the Software.
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 */
38 /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
39
40 #include <sys/cdefs.h>
41 __RCSID("$NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $");
42
43 #include "portable.h"
44
45 #include <ac/stdlib.h>
46 #include <ac/string.h>
47 #include <ac/unistd.h>
48
49 #include "ure.h"
50
51 /*
52 * Flags used internally in the DFA.
53 */
54 #define _URE_DFA_CASEFOLD 0x01
55 #define _URE_DFA_BLANKLINE 0x02
56
57 static unsigned long cclass_flags[] = {
58 0,
59 _URE_NONSPACING,
60 _URE_COMBINING,
61 _URE_NUMDIGIT,
62 _URE_NUMOTHER,
63 _URE_SPACESEP,
64 _URE_LINESEP,
65 _URE_PARASEP,
66 _URE_CNTRL,
67 _URE_PUA,
68 _URE_UPPER,
69 _URE_LOWER,
70 _URE_TITLE,
71 _URE_MODIFIER,
72 _URE_OTHERLETTER,
73 _URE_DASHPUNCT,
74 _URE_OPENPUNCT,
75 _URE_CLOSEPUNCT,
76 _URE_OTHERPUNCT,
77 _URE_MATHSYM,
78 _URE_CURRENCYSYM,
79 _URE_OTHERSYM,
80 _URE_LTR,
81 _URE_RTL,
82 _URE_EURONUM,
83 _URE_EURONUMSEP,
84 _URE_EURONUMTERM,
85 _URE_ARABNUM,
86 _URE_COMMONSEP,
87 _URE_BLOCKSEP,
88 _URE_SEGMENTSEP,
89 _URE_WHITESPACE,
90 _URE_OTHERNEUT,
91 };
92
93 /*
94 * Symbol types for the DFA.
95 */
96 #define _URE_ANY_CHAR 1
97 #define _URE_CHAR 2
98 #define _URE_CCLASS 3
99 #define _URE_NCCLASS 4
100 #define _URE_BOL_ANCHOR 5
101 #define _URE_EOL_ANCHOR 6
102
103 /*
104 * Op codes for converting the NFA to a DFA.
105 */
106 #define _URE_SYMBOL 10
107 #define _URE_PAREN 11
108 #define _URE_QUEST 12
109 #define _URE_STAR 13
110 #define _URE_PLUS 14
111 #define _URE_ONE 15
112 #define _URE_AND 16
113 #define _URE_OR 17
114
115 #define _URE_NOOP 0xffff
116
117 #define _URE_REGSTART 0x8000
118 #define _URE_REGEND 0x4000
119
120 /*
121 * Structure used to handle a compacted range of characters.
122 */
123 typedef struct {
124 ucs4_t min_code;
125 ucs4_t max_code;
126 } _ure_range_t;
127
128 typedef struct {
129 _ure_range_t *ranges;
130 ucs2_t ranges_used;
131 ucs2_t ranges_size;
132 } _ure_ccl_t;
133
134 typedef union {
135 ucs4_t chr;
136 _ure_ccl_t ccl;
137 } _ure_sym_t;
138
139 /*
140 * This is a general element structure used for expressions and stack
141 * elements.
142 */
143 typedef struct {
144 ucs2_t reg;
145 ucs2_t onstack;
146 ucs2_t type;
147 ucs2_t lhs;
148 ucs2_t rhs;
149 } _ure_elt_t;
150
151 /*
152 * This is a structure used to track a list or a stack of states.
153 */
154 typedef struct {
155 ucs2_t *slist;
156 ucs2_t slist_size;
157 ucs2_t slist_used;
158 } _ure_stlist_t;
159
160 /*
161 * Structure to track the list of unique states for a symbol
162 * during reduction.
163 */
164 typedef struct {
165 ucs2_t id;
166 ucs2_t type;
167 unsigned long mods;
168 unsigned long props;
169 _ure_sym_t sym;
170 _ure_stlist_t states;
171 } _ure_symtab_t;
172
173 /*
174 * Structure to hold a single state.
175 */
176 typedef struct {
177 ucs2_t id;
178 ucs2_t accepting;
179 ucs2_t pad;
180 _ure_stlist_t st;
181 _ure_elt_t *trans;
182 ucs2_t trans_size;
183 ucs2_t trans_used;
184 } _ure_state_t;
185
186 /*
187 * Structure used for keeping lists of states.
188 */
189 typedef struct {
190 _ure_state_t *states;
191 ucs2_t states_size;
192 ucs2_t states_used;
193 } _ure_statetable_t;
194
195 /*
196 * Structure to track pairs of DFA states when equivalent states are
197 * merged.
198 */
199 typedef struct {
200 ucs2_t l;
201 ucs2_t r;
202 } _ure_equiv_t;
203
204 /*
205 * Structure used for constructing the NFA and reducing to a minimal DFA.
206 */
207 typedef struct _ure_buffer_t {
208 int reducing;
209 int error;
210 unsigned long flags;
211
212 _ure_stlist_t stack;
213
214 /*
215 * Table of unique symbols encountered.
216 */
217 _ure_symtab_t *symtab;
218 ucs2_t symtab_size;
219 ucs2_t symtab_used;
220
221 /*
222 * Tracks the unique expressions generated for the NFA and when the NFA is
223 * reduced.
224 */
225 _ure_elt_t *expr;
226 ucs2_t expr_used;
227 ucs2_t expr_size;
228
229 /*
230 * The reduced table of unique groups of NFA states.
231 */
232 _ure_statetable_t states;
233
234 /*
235 * Tracks states when equivalent states are merged.
236 */
237 _ure_equiv_t *equiv;
238 ucs2_t equiv_used;
239 ucs2_t equiv_size;
240 } _ure_buffer_t;
241
242 typedef struct {
243 ucs2_t symbol;
244 ucs2_t next_state;
245 } _ure_trans_t;
246
247 typedef struct {
248 ucs2_t accepting;
249 ucs2_t ntrans;
250 _ure_trans_t *trans;
251 } _ure_dstate_t;
252
253 typedef struct _ure_dfa_t {
254 unsigned long flags;
255
256 _ure_symtab_t *syms;
257 ucs2_t nsyms;
258
259 _ure_dstate_t *states;
260 ucs2_t nstates;
261
262 _ure_trans_t *trans;
263 ucs2_t ntrans;
264 } _ure_dfa_t;
265
266 /*************************************************************************
267 *
268 * Functions.
269 *
270 *************************************************************************/
271
272 static void
_ure_memmove(char * dest,char * src,unsigned long bytes)273 _ure_memmove(char *dest, char *src, unsigned long bytes)
274 {
275 long i, j;
276
277 i = (long) bytes;
278 j = i & 7;
279 i = (i + 7) >> 3;
280
281 /*
282 * Do a memmove using Ye Olde Duff's Device for efficiency.
283 */
284 if (src < dest) {
285 src += bytes;
286 dest += bytes;
287
288 switch (j) {
289 case 0: do {
290 *--dest = *--src;
291 case 7: *--dest = *--src;
292 case 6: *--dest = *--src;
293 case 5: *--dest = *--src;
294 case 4: *--dest = *--src;
295 case 3: *--dest = *--src;
296 case 2: *--dest = *--src;
297 case 1: *--dest = *--src;
298 } while (--i > 0);
299 }
300 } else if (src > dest) {
301 switch (j) {
302 case 0: do {
303 *dest++ = *src++;
304 case 7: *dest++ = *src++;
305 case 6: *dest++ = *src++;
306 case 5: *dest++ = *src++;
307 case 4: *dest++ = *src++;
308 case 3: *dest++ = *src++;
309 case 2: *dest++ = *src++;
310 case 1: *dest++ = *src++;
311 } while (--i > 0);
312 }
313 }
314 }
315
316 static void
_ure_push(ucs2_t v,_ure_buffer_t * b)317 _ure_push(ucs2_t v, _ure_buffer_t *b)
318 {
319 _ure_stlist_t *s;
320
321 if (b == 0)
322 return;
323
324 /*
325 * If the `reducing' parameter is non-zero, check to see if the value
326 * passed is already on the stack.
327 */
328 if (b->reducing != 0 && b->expr[v].onstack != 0)
329 return;
330
331 s = &b->stack;
332 if (s->slist_used == s->slist_size) {
333 if (s->slist_size == 0)
334 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
335 else
336 s->slist = (ucs2_t *) realloc((char *) s->slist,
337 sizeof(ucs2_t) * (s->slist_size + 8));
338 s->slist_size += 8;
339 }
340 s->slist[s->slist_used++] = v;
341
342 /*
343 * If the `reducing' parameter is non-zero, flag the element as being on
344 * the stack.
345 */
346 if (b->reducing != 0)
347 b->expr[v].onstack = 1;
348 }
349
350 static ucs2_t
_ure_peek(_ure_buffer_t * b)351 _ure_peek(_ure_buffer_t *b)
352 {
353 if (b == 0 || b->stack.slist_used == 0)
354 return _URE_NOOP;
355
356 return b->stack.slist[b->stack.slist_used - 1];
357 }
358
359 static ucs2_t
_ure_pop(_ure_buffer_t * b)360 _ure_pop(_ure_buffer_t *b)
361 {
362 ucs2_t v;
363
364 if (b == 0 || b->stack.slist_used == 0)
365 return _URE_NOOP;
366
367 v = b->stack.slist[--b->stack.slist_used];
368 if (b->reducing)
369 b->expr[v].onstack = 0;
370
371 return v;
372 }
373
374 /*************************************************************************
375 *
376 * Start symbol parse functions.
377 *
378 *************************************************************************/
379
380 /*
381 * Parse a comma-separated list of integers that represent character
382 * properties. Combine them into a mask that is returned in the `mask'
383 * variable, and return the number of characters consumed.
384 */
385 static unsigned long
_ure_prop_list(ucs2_t * pp,unsigned long limit,unsigned long * mask,_ure_buffer_t * b)386 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
387 _ure_buffer_t *b)
388 {
389 unsigned long n, m;
390 ucs2_t *sp, *ep;
391
392 sp = pp;
393 ep = sp + limit;
394
395 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
396 if (*sp == ',') {
397 /*
398 * Encountered a comma, so select the next character property flag
399 * and reset the number.
400 */
401 m |= cclass_flags[n];
402 n = 0;
403 } else if (*sp >= '0' && *sp <= '9')
404 /*
405 * Encountered a digit, so start or continue building the cardinal
406 * that represents the character property flag.
407 */
408 n = (n * 10) + (*sp - '0');
409 else
410 /*
411 * Encountered something that is not part of the property list.
412 * Indicate that we are done.
413 */
414 break;
415
416 /*
417 * If a property number greater than 32 occurs, then there is a
418 * problem. Most likely a missing comma separator.
419 */
420 if (n > 32)
421 b->error = _URE_INVALID_PROPERTY;
422 }
423
424 if (b->error == _URE_OK && n != 0)
425 m |= cclass_flags[n];
426
427 /*
428 * Set the mask that represents the group of character properties.
429 */
430 *mask = m;
431
432 /*
433 * Return the number of characters consumed.
434 */
435 return sp - pp;
436 }
437
438 /*
439 * Collect a hex number with 1 to 4 digits and return the number
440 * of characters used.
441 */
442 static unsigned long
_ure_hex(ucs2_t * np,unsigned long limit,ucs4_t * n)443 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
444 {
445 ucs2_t i;
446 ucs2_t *sp, *ep;
447 ucs4_t nn;
448
449 sp = np;
450 ep = sp + limit;
451
452 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
453 if (*sp >= '0' && *sp <= '9')
454 nn = (nn << 4) + (*sp - '0');
455 else if (*sp >= 'A' && *sp <= 'F')
456 nn = (nn << 4) + ((*sp - 'A') + 10);
457 else if (*sp >= 'a' && *sp <= 'f')
458 nn = (nn << 4) + ((*sp - 'a') + 10);
459 else
460 /*
461 * Encountered something that is not a hex digit.
462 */
463 break;
464 }
465
466 /*
467 * Assign the character code collected and return the number of
468 * characters used.
469 */
470 *n = nn;
471
472 return sp - np;
473 }
474
475 /*
476 * Insert a range into a character class, removing duplicates and ordering
477 * them in increasing range-start order.
478 */
479 static void
_ure_add_range(_ure_ccl_t * ccl,_ure_range_t * r,_ure_buffer_t * b)480 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
481 {
482 ucs2_t i;
483 ucs4_t tmp;
484 _ure_range_t *rp;
485
486 /*
487 * If the `casefold' flag is set, then make sure both endpoints of the
488 * range are converted to lower case.
489 */
490 if (b->flags & _URE_DFA_CASEFOLD) {
491 r->min_code = _ure_tolower(r->min_code);
492 r->max_code = _ure_tolower(r->max_code);
493 }
494
495 /*
496 * Swap the range endpoints if they are not in increasing order.
497 */
498 if (r->min_code > r->max_code) {
499 tmp = r->min_code;
500 r->min_code = r->max_code;
501 r->max_code = tmp;
502 }
503
504 for (i = 0, rp = ccl->ranges;
505 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
506
507 /*
508 * Check for a duplicate.
509 */
510 if (i < ccl->ranges_used &&
511 r->min_code == rp->min_code && r->max_code == rp->max_code)
512 return;
513
514 if (ccl->ranges_used == ccl->ranges_size) {
515 if (ccl->ranges_size == 0)
516 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
517 else
518 ccl->ranges = (_ure_range_t *)
519 realloc((char *) ccl->ranges,
520 sizeof(_ure_range_t) * (ccl->ranges_size + 8));
521 ccl->ranges_size += 8;
522 }
523
524 rp = ccl->ranges + ccl->ranges_used;
525
526 if (i < ccl->ranges_used)
527 _ure_memmove((char *) (rp + 1), (char *) rp,
528 sizeof(_ure_range_t) * (ccl->ranges_used - i));
529
530 ccl->ranges_used++;
531 rp->min_code = r->min_code;
532 rp->max_code = r->max_code;
533 }
534
535 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
536 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
537 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
538 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
539 _URE_OTHERPUNCT)
540 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
541 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
542 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
543 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
544
545 typedef void (*_ure_cclsetup_t)(
546 _ure_symtab_t *sym,
547 unsigned long mask,
548 _ure_buffer_t *b
549 );
550
551 typedef struct {
552 ucs2_t key;
553 unsigned long len;
554 unsigned long next;
555 _ure_cclsetup_t func;
556 unsigned long mask;
557 } _ure_trie_t;
558
559 static void
_ure_ccl_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)560 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
561 {
562 sym->props |= mask;
563 }
564
565 static void
_ure_space_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)566 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
567 {
568 _ure_range_t range;
569
570 sym->props |= mask;
571
572 /*
573 * Add the additional characters needed for handling isspace().
574 */
575 range.min_code = range.max_code = '\t';
576 _ure_add_range(&sym->sym.ccl, &range, b);
577 range.min_code = range.max_code = '\r';
578 _ure_add_range(&sym->sym.ccl, &range, b);
579 range.min_code = range.max_code = '\n';
580 _ure_add_range(&sym->sym.ccl, &range, b);
581 range.min_code = range.max_code = '\f';
582 _ure_add_range(&sym->sym.ccl, &range, b);
583 range.min_code = range.max_code = 0xfeff;
584 _ure_add_range(&sym->sym.ccl, &range, b);
585 }
586
587 static void
_ure_xdigit_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)588 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
589 {
590 _ure_range_t range;
591
592 /*
593 * Add the additional characters needed for handling isxdigit().
594 */
595 range.min_code = '0';
596 range.max_code = '9';
597 _ure_add_range(&sym->sym.ccl, &range, b);
598 range.min_code = 'A';
599 range.max_code = 'F';
600 _ure_add_range(&sym->sym.ccl, &range, b);
601 range.min_code = 'a';
602 range.max_code = 'f';
603 _ure_add_range(&sym->sym.ccl, &range, b);
604 }
605
606 static _ure_trie_t cclass_trie[] = {
607 {0x003a, 1, 1, 0, 0},
608 {0x0061, 9, 10, 0, 0},
609 {0x0063, 8, 19, 0, 0},
610 {0x0064, 7, 24, 0, 0},
611 {0x0067, 6, 29, 0, 0},
612 {0x006c, 5, 34, 0, 0},
613 {0x0070, 4, 39, 0, 0},
614 {0x0073, 3, 49, 0, 0},
615 {0x0075, 2, 54, 0, 0},
616 {0x0078, 1, 59, 0, 0},
617 {0x006c, 1, 11, 0, 0},
618 {0x006e, 2, 13, 0, 0},
619 {0x0070, 1, 16, 0, 0},
620 {0x0075, 1, 14, 0, 0},
621 {0x006d, 1, 15, 0, 0},
622 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
623 {0x0068, 1, 17, 0, 0},
624 {0x0061, 1, 18, 0, 0},
625 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
626 {0x006e, 1, 20, 0, 0},
627 {0x0074, 1, 21, 0, 0},
628 {0x0072, 1, 22, 0, 0},
629 {0x006c, 1, 23, 0, 0},
630 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
631 {0x0069, 1, 25, 0, 0},
632 {0x0067, 1, 26, 0, 0},
633 {0x0069, 1, 27, 0, 0},
634 {0x0074, 1, 28, 0, 0},
635 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
636 {0x0072, 1, 30, 0, 0},
637 {0x0061, 1, 31, 0, 0},
638 {0x0070, 1, 32, 0, 0},
639 {0x0068, 1, 33, 0, 0},
640 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
641 {0x006f, 1, 35, 0, 0},
642 {0x0077, 1, 36, 0, 0},
643 {0x0065, 1, 37, 0, 0},
644 {0x0072, 1, 38, 0, 0},
645 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
646 {0x0072, 2, 41, 0, 0},
647 {0x0075, 1, 45, 0, 0},
648 {0x0069, 1, 42, 0, 0},
649 {0x006e, 1, 43, 0, 0},
650 {0x0074, 1, 44, 0, 0},
651 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
652 {0x006e, 1, 46, 0, 0},
653 {0x0063, 1, 47, 0, 0},
654 {0x0074, 1, 48, 0, 0},
655 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
656 {0x0070, 1, 50, 0, 0},
657 {0x0061, 1, 51, 0, 0},
658 {0x0063, 1, 52, 0, 0},
659 {0x0065, 1, 53, 0, 0},
660 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
661 {0x0070, 1, 55, 0, 0},
662 {0x0070, 1, 56, 0, 0},
663 {0x0065, 1, 57, 0, 0},
664 {0x0072, 1, 58, 0, 0},
665 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
666 {0x0064, 1, 60, 0, 0},
667 {0x0069, 1, 61, 0, 0},
668 {0x0067, 1, 62, 0, 0},
669 {0x0069, 1, 63, 0, 0},
670 {0x0074, 1, 64, 0, 0},
671 {0x003a, 1, 65, _ure_xdigit_setup, 0},
672 };
673
674 /*
675 * Probe for one of the POSIX colon delimited character classes in the static
676 * trie.
677 */
678 static unsigned long
_ure_posix_ccl(ucs2_t * cp,unsigned long limit,_ure_symtab_t * sym,_ure_buffer_t * b)679 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
680 _ure_buffer_t *b)
681 {
682 int i;
683 unsigned long n;
684 _ure_trie_t *tp;
685 ucs2_t *sp, *ep;
686
687 /*
688 * If the number of characters left is less than 7, then this cannot be
689 * interpreted as one of the colon delimited classes.
690 */
691 if (limit < 7)
692 return 0;
693
694 sp = cp;
695 ep = sp + limit;
696 tp = cclass_trie;
697 for (i = 0; sp < ep && i < 8; i++, sp++) {
698 n = tp->len;
699
700 for (; n > 0 && tp->key != *sp; tp++, n--) ;
701
702 if (n == 0)
703 return 0;
704
705 if (*sp == ':' && (i == 6 || i == 7)) {
706 sp++;
707 break;
708 }
709 if (sp + 1 < ep)
710 tp = cclass_trie + tp->next;
711 }
712 if (tp->func == 0)
713 return 0;
714
715 (*tp->func)(sym, tp->mask, b);
716
717 return sp - cp;
718 }
719
720 /*
721 * Construct a list of ranges and return the number of characters consumed.
722 */
723 static unsigned long
_ure_cclass(ucs2_t * cp,unsigned long limit,_ure_symtab_t * symp,_ure_buffer_t * b)724 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
725 _ure_buffer_t *b)
726 {
727 int range_end;
728 unsigned long n;
729 ucs2_t *sp, *ep;
730 ucs4_t c, last;
731 _ure_ccl_t *cclp;
732 _ure_range_t range;
733
734 sp = cp;
735 ep = sp + limit;
736
737 if (*sp == '^') {
738 symp->type = _URE_NCCLASS;
739 sp++;
740 } else
741 symp->type = _URE_CCLASS;
742
743 for (last = 0, range_end = 0;
744 b->error == _URE_OK && sp < ep && *sp != ']'; ) {
745 c = *sp++;
746 if (c == '\\') {
747 if (sp == ep) {
748 /*
749 * The EOS was encountered when expecting the reverse solidus
750 * to be followed by the character it is escaping. Set an
751 * error code and return the number of characters consumed up
752 * to this point.
753 */
754 b->error = _URE_UNEXPECTED_EOS;
755 return sp - cp;
756 }
757
758 c = *sp++;
759 switch (c) {
760 case 'a':
761 c = 0x07;
762 break;
763 case 'b':
764 c = 0x08;
765 break;
766 case 'f':
767 c = 0x0c;
768 break;
769 case 'n':
770 c = 0x0a;
771 break;
772 case 'r':
773 c = 0x0d;
774 break;
775 case 't':
776 c = 0x09;
777 break;
778 case 'v':
779 c = 0x0b;
780 break;
781 case 'p':
782 case 'P':
783 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
784 /*
785 * Invert the bit mask of the properties if this is a negated
786 * character class or if 'P' is used to specify a list of
787 * character properties that should *not* match in a
788 * character class.
789 */
790 if (c == 'P')
791 symp->props = ~symp->props;
792 continue;
793 break;
794 case 'x':
795 case 'X':
796 case 'u':
797 case 'U':
798 if (sp < ep &&
799 ((*sp >= '0' && *sp <= '9') ||
800 (*sp >= 'A' && *sp <= 'F') ||
801 (*sp >= 'a' && *sp <= 'f')))
802 sp += _ure_hex(sp, ep - sp, &c);
803 }
804 } else if (c == ':') {
805 /*
806 * Probe for a POSIX colon delimited character class.
807 */
808 sp--;
809 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
810 sp++;
811 else {
812 sp += n;
813 continue;
814 }
815 }
816
817 cclp = &symp->sym.ccl;
818
819 /*
820 * Check to see if the current character is a low surrogate that needs
821 * to be combined with a preceding high surrogate.
822 */
823 if (last != 0) {
824 if (c >= 0xdc00 && c <= 0xdfff)
825 /*
826 * Construct the UTF16 character code.
827 */
828 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
829 else {
830 /*
831 * Add the isolated high surrogate to the range.
832 */
833 if (range_end == 1)
834 range.max_code = last & 0xffff;
835 else
836 range.min_code = range.max_code = last & 0xffff;
837
838 _ure_add_range(cclp, &range, b);
839 range_end = 0;
840 }
841 }
842
843 /*
844 * Clear the last character code.
845 */
846 last = 0;
847
848 /*
849 * This slightly awkward code handles the different cases needed to
850 * construct a range.
851 */
852 if (c >= 0xd800 && c <= 0xdbff) {
853 /*
854 * If the high surrogate is followed by a range indicator, simply
855 * add it as the range start. Otherwise, save it in case the next
856 * character is a low surrogate.
857 */
858 if (*sp == '-') {
859 sp++;
860 range.min_code = c;
861 range_end = 1;
862 } else
863 last = c;
864 } else if (range_end == 1) {
865 range.max_code = c;
866 _ure_add_range(cclp, &range, b);
867 range_end = 0;
868 } else {
869 range.min_code = range.max_code = c;
870 if (*sp == '-') {
871 sp++;
872 range_end = 1;
873 } else
874 _ure_add_range(cclp, &range, b);
875 }
876 }
877
878 if (sp < ep && *sp == ']')
879 sp++;
880 else
881 /*
882 * The parse was not terminated by the character class close symbol
883 * (']'), so set an error code.
884 */
885 b->error = _URE_CCLASS_OPEN;
886
887 return sp - cp;
888 }
889
890 /*
891 * Probe for a low surrogate hex code.
892 */
893 static unsigned long
_ure_probe_ls(ucs2_t * ls,unsigned long limit,ucs4_t * c)894 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
895 {
896 ucs4_t i, code;
897 ucs2_t *sp, *ep;
898
899 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
900 if (*sp >= '0' && *sp <= '9')
901 code = (code << 4) + (*sp - '0');
902 else if (*sp >= 'A' && *sp <= 'F')
903 code = (code << 4) + ((*sp - 'A') + 10);
904 else if (*sp >= 'a' && *sp <= 'f')
905 code = (code << 4) + ((*sp - 'a') + 10);
906 else
907 break;
908 }
909
910 *c = code;
911 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
912 }
913
914 static unsigned long
_ure_compile_symbol(ucs2_t * sym,unsigned long limit,_ure_symtab_t * symp,_ure_buffer_t * b)915 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
916 _ure_buffer_t *b)
917 {
918 ucs4_t c;
919 ucs2_t *sp, *ep;
920
921 sp = sym;
922 ep = sym + limit;
923
924 if ((c = *sp++) == '\\') {
925
926 if (sp == ep) {
927 /*
928 * The EOS was encountered when expecting the reverse solidus to
929 * be followed by the character it is escaping. Set an error code
930 * and return the number of characters consumed up to this point.
931 */
932 b->error = _URE_UNEXPECTED_EOS;
933 return sp - sym;
934 }
935
936 c = *sp++;
937 switch (c) {
938 case 'p':
939 case 'P':
940 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
941 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
942 break;
943 case 'a':
944 symp->type = _URE_CHAR;
945 symp->sym.chr = 0x07;
946 break;
947 case 'b':
948 symp->type = _URE_CHAR;
949 symp->sym.chr = 0x08;
950 break;
951 case 'f':
952 symp->type = _URE_CHAR;
953 symp->sym.chr = 0x0c;
954 break;
955 case 'n':
956 symp->type = _URE_CHAR;
957 symp->sym.chr = 0x0a;
958 break;
959 case 'r':
960 symp->type = _URE_CHAR;
961 symp->sym.chr = 0x0d;
962 break;
963 case 't':
964 symp->type = _URE_CHAR;
965 symp->sym.chr = 0x09;
966 break;
967 case 'v':
968 symp->type = _URE_CHAR;
969 symp->sym.chr = 0x0b;
970 break;
971 case 'x':
972 case 'X':
973 case 'u':
974 case 'U':
975 /*
976 * Collect between 1 and 4 digits representing a UCS2 code. Fall
977 * through to the next case.
978 */
979 if (sp < ep &&
980 ((*sp >= '0' && *sp <= '9') ||
981 (*sp >= 'A' && *sp <= 'F') ||
982 (*sp >= 'a' && *sp <= 'f')))
983 sp += _ure_hex(sp, ep - sp, &c);
984 /* FALLTHROUGH */
985 default:
986 /*
987 * Simply add an escaped character here.
988 */
989 symp->type = _URE_CHAR;
990 symp->sym.chr = c;
991 }
992 } else if (c == '^' || c == '$')
993 /*
994 * Handle the BOL and EOL anchors. This actually consists simply of
995 * setting a flag that indicates that the user supplied anchor match
996 * function should be called. This needs to be done instead of simply
997 * matching line/paragraph separators because beginning-of-text and
998 * end-of-text tests are needed as well.
999 */
1000 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
1001 else if (c == '[')
1002 /*
1003 * Construct a character class.
1004 */
1005 sp += _ure_cclass(sp, ep - sp, symp, b);
1006 else if (c == '.')
1007 symp->type = _URE_ANY_CHAR;
1008 else {
1009 symp->type = _URE_CHAR;
1010 symp->sym.chr = c;
1011 }
1012
1013 /*
1014 * If the symbol type happens to be a character and is a high surrogate,
1015 * then probe forward to see if it is followed by a low surrogate that
1016 * needs to be added.
1017 */
1018 if (sp < ep && symp->type == _URE_CHAR &&
1019 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1020
1021 if (0xdc00 <= *sp && *sp <= 0xdfff) {
1022 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1023 (*sp & 0x03ff));
1024 sp++;
1025 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1026 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1027 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1028 if (0xdc00 <= c && c <= 0xdfff) {
1029 /*
1030 * Take into account the \[xu] in front of the hex code.
1031 */
1032 sp += 2;
1033 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1034 (c & 0x03ff));
1035 }
1036 }
1037 }
1038
1039 /*
1040 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1041 * the `casefold' flag is set.
1042 */
1043 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1044 symp->sym.chr = _ure_tolower(symp->sym.chr);
1045
1046 /*
1047 * If the symbol constructed is anything other than one of the anchors,
1048 * make sure the _URE_DFA_BLANKLINE flag is removed.
1049 */
1050 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1051 b->flags &= ~_URE_DFA_BLANKLINE;
1052
1053 /*
1054 * Return the number of characters consumed.
1055 */
1056 return sp - sym;
1057 }
1058
1059 static int
_ure_sym_neq(_ure_symtab_t * a,_ure_symtab_t * b)1060 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1061 {
1062 if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1063 return 1;
1064
1065 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1066 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1067 return 1;
1068 if (a->sym.ccl.ranges_used > 0 &&
1069 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1070 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1071 return 1;
1072 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1073 return 1;
1074 return 0;
1075 }
1076
1077 /*
1078 * Construct a symbol, but only keep unique symbols.
1079 */
1080 static ucs2_t
_ure_make_symbol(ucs2_t * sym,unsigned long limit,unsigned long * consumed,_ure_buffer_t * b)1081 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1082 _ure_buffer_t *b)
1083 {
1084 ucs2_t i;
1085 _ure_symtab_t *sp, symbol;
1086
1087 /*
1088 * Build the next symbol so we can test to see if it is already in the
1089 * symbol table.
1090 */
1091 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1092 *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1093
1094 /*
1095 * Check to see if the symbol exists.
1096 */
1097 for (i = 0, sp = b->symtab;
1098 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1099
1100 if (i < b->symtab_used) {
1101 /*
1102 * Free up any ranges used for the symbol.
1103 */
1104 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1105 symbol.sym.ccl.ranges_size > 0)
1106 free((char *) symbol.sym.ccl.ranges);
1107
1108 return b->symtab[i].id;
1109 }
1110
1111 /*
1112 * Need to add the new symbol.
1113 */
1114 if (b->symtab_used == b->symtab_size) {
1115 if (b->symtab_size == 0)
1116 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1117 else
1118 b->symtab = (_ure_symtab_t *)
1119 realloc((char *) b->symtab,
1120 sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1121 sp = b->symtab + b->symtab_size;
1122 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1123 b->symtab_size += 8;
1124 }
1125
1126 symbol.id = b->symtab_used++;
1127 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1128 sizeof(_ure_symtab_t));
1129
1130 return symbol.id;
1131 }
1132
1133 /*************************************************************************
1134 *
1135 * End symbol parse functions.
1136 *
1137 *************************************************************************/
1138
1139 static ucs2_t
_ure_make_expr(ucs2_t type,ucs2_t lhs,ucs2_t rhs,_ure_buffer_t * b)1140 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1141 {
1142 ucs2_t i;
1143
1144 if (b == 0)
1145 return _URE_NOOP;
1146
1147 /*
1148 * Determine if the expression already exists or not.
1149 */
1150 for (i = 0; i < b->expr_used; i++) {
1151 if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1152 b->expr[i].rhs == rhs)
1153 break;
1154 }
1155 if (i < b->expr_used)
1156 return i;
1157
1158 /*
1159 * Need to add a new expression.
1160 */
1161 if (b->expr_used == b->expr_size) {
1162 if (b->expr_size == 0)
1163 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1164 else
1165 b->expr = (_ure_elt_t *)
1166 realloc((char *) b->expr,
1167 sizeof(_ure_elt_t) * (b->expr_size + 8));
1168 b->expr_size += 8;
1169 }
1170
1171 b->expr[b->expr_used].onstack = 0;
1172 b->expr[b->expr_used].type = type;
1173 b->expr[b->expr_used].lhs = lhs;
1174 b->expr[b->expr_used].rhs = rhs;
1175
1176 return b->expr_used++;
1177 }
1178
1179 static unsigned char spmap[] = {
1180 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1181 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1182 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1183 };
1184
1185 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1186 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1187
1188 /*
1189 * Convert the regular expression into an NFA in a form that will be easy to
1190 * reduce to a DFA. The starting state for the reduction will be returned.
1191 */
1192 static ucs2_t
_ure_re2nfa(ucs2_t * re,unsigned long relen,_ure_buffer_t * b)1193 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1194 {
1195 ucs2_t c, state, top, sym, *sp, *ep;
1196 unsigned long used;
1197
1198 state = _URE_NOOP;
1199
1200 sp = re;
1201 ep = sp + relen;
1202 while (b->error == _URE_OK && sp < ep) {
1203 c = *sp++;
1204 switch (c) {
1205 case '(':
1206 _ure_push(_URE_PAREN, b);
1207 break;
1208 case ')':
1209 /*
1210 * Check for the case of too many close parentheses.
1211 */
1212 if (_ure_peek(b) == _URE_NOOP) {
1213 b->error = _URE_UNBALANCED_GROUP;
1214 break;
1215 }
1216
1217 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1218 /*
1219 * Make an expression with the AND or OR operator and its right
1220 * hand side.
1221 */
1222 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1223
1224 /*
1225 * Remove the _URE_PAREN off the stack.
1226 */
1227 (void) _ure_pop(b);
1228 break;
1229 case '*':
1230 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1231 break;
1232 case '+':
1233 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1234 break;
1235 case '?':
1236 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1237 break;
1238 case '|':
1239 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1240 /*
1241 * Make an expression with the AND or OR operator and its right
1242 * hand side.
1243 */
1244 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1245
1246 _ure_push(state, b);
1247 _ure_push(_URE_OR, b);
1248 break;
1249 default:
1250 sp--;
1251 sym = _ure_make_symbol(sp, ep - sp, &used, b);
1252 sp += used;
1253 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1254 break;
1255 }
1256
1257 if (c != '(' && c != '|' && sp < ep &&
1258 (!_ure_isspecial(*sp) || *sp == '(')) {
1259 _ure_push(state, b);
1260 _ure_push(_URE_AND, b);
1261 }
1262 }
1263 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1264 /*
1265 * Make an expression with the AND or OR operator and its right
1266 * hand side.
1267 */
1268 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1269
1270 if (b->stack.slist_used > 0)
1271 b->error = _URE_UNBALANCED_GROUP;
1272
1273 return (b->error == _URE_OK) ? state : _URE_NOOP;
1274 }
1275
1276 static void
_ure_add_symstate(ucs2_t sym,ucs2_t state,_ure_buffer_t * b)1277 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1278 {
1279 ucs2_t i, *stp;
1280 _ure_symtab_t *sp;
1281
1282 /*
1283 * Locate the symbol in the symbol table so the state can be added.
1284 * If the symbol doesn't exist, then a real problem exists.
1285 */
1286 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1287 i++, sp++) ;
1288
1289 /*
1290 * Now find out if the state exists in the symbol's state list.
1291 */
1292 for (i = 0, stp = sp->states.slist;
1293 i < sp->states.slist_used && state > *stp; i++, stp++) ;
1294
1295 if (i == sp->states.slist_used || state < *stp) {
1296 /*
1297 * Need to add the state in order.
1298 */
1299 if (sp->states.slist_used == sp->states.slist_size) {
1300 if (sp->states.slist_size == 0)
1301 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1302 else
1303 sp->states.slist = (ucs2_t *)
1304 realloc((char *) sp->states.slist,
1305 sizeof(ucs2_t) * (sp->states.slist_size + 8));
1306 sp->states.slist_size += 8;
1307 }
1308 if (i < sp->states.slist_used)
1309 (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1310 (char *) (sp->states.slist + i),
1311 sizeof(ucs2_t) * (sp->states.slist_used - i));
1312 sp->states.slist[i] = state;
1313 sp->states.slist_used++;
1314 }
1315 }
1316
1317 static ucs2_t
_ure_add_state(ucs2_t nstates,ucs2_t * states,_ure_buffer_t * b)1318 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1319 {
1320 ucs2_t i;
1321 _ure_state_t *sp;
1322
1323 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1324 if (sp->st.slist_used == nstates &&
1325 memcmp((char *) states, (char *) sp->st.slist,
1326 sizeof(ucs2_t) * nstates) == 0)
1327 break;
1328 }
1329
1330 if (i == b->states.states_used) {
1331 /*
1332 * Need to add a new DFA state (set of NFA states).
1333 */
1334 if (b->states.states_used == b->states.states_size) {
1335 if (b->states.states_size == 0)
1336 b->states.states = (_ure_state_t *)
1337 malloc(sizeof(_ure_state_t) << 3);
1338 else
1339 b->states.states = (_ure_state_t *)
1340 realloc((char *) b->states.states,
1341 sizeof(_ure_state_t) * (b->states.states_size + 8));
1342 sp = b->states.states + b->states.states_size;
1343 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1344 b->states.states_size += 8;
1345 }
1346
1347 sp = b->states.states + b->states.states_used++;
1348 sp->id = i;
1349
1350 if (sp->st.slist_used + nstates > sp->st.slist_size) {
1351 if (sp->st.slist_size == 0)
1352 sp->st.slist = (ucs2_t *)
1353 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1354 else
1355 sp->st.slist = (ucs2_t *)
1356 realloc((char *) sp->st.slist,
1357 sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1358 sp->st.slist_size = sp->st.slist_used + nstates;
1359 }
1360 sp->st.slist_used = nstates;
1361 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1362 sizeof(ucs2_t) * nstates);
1363 }
1364
1365 /*
1366 * Return the ID of the DFA state representing a group of NFA states.
1367 */
1368 return i;
1369 }
1370
1371 static void
_ure_reduce(ucs2_t start,_ure_buffer_t * b)1372 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1373 {
1374 ucs2_t i, j, state, eval, syms, rhs;
1375 ucs2_t s1, s2, ns1, ns2;
1376 _ure_state_t *sp;
1377 _ure_symtab_t *smp;
1378
1379 b->reducing = 1;
1380
1381 /*
1382 * Add the starting state for the reduction.
1383 */
1384 _ure_add_state(1, &start, b);
1385
1386 /*
1387 * Process each set of NFA states that get created.
1388 */
1389 for (i = 0; i < b->states.states_used; i++) {
1390 sp = b->states.states + i;
1391
1392 /*
1393 * Push the current states on the stack.
1394 */
1395 for (j = 0; j < sp->st.slist_used; j++)
1396 _ure_push(sp->st.slist[j], b);
1397
1398 /*
1399 * Reduce the NFA states.
1400 */
1401 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1402 state = b->stack.slist[j];
1403 eval = 1;
1404
1405 /*
1406 * This inner loop is the iterative equivalent of recursively
1407 * reducing subexpressions generated as a result of a reduction.
1408 */
1409 while (eval) {
1410 switch (b->expr[state].type) {
1411 case _URE_SYMBOL:
1412 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1413 _ure_add_symstate(b->expr[state].lhs, ns1, b);
1414 syms++;
1415 eval = 0;
1416 break;
1417 case _URE_ONE:
1418 sp->accepting = 1;
1419 eval = 0;
1420 break;
1421 case _URE_QUEST:
1422 s1 = b->expr[state].lhs;
1423 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1424 state = _ure_make_expr(_URE_OR, ns1, s1, b);
1425 break;
1426 case _URE_PLUS:
1427 s1 = b->expr[state].lhs;
1428 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1429 state = _ure_make_expr(_URE_AND, s1, ns1, b);
1430 break;
1431 case _URE_STAR:
1432 s1 = b->expr[state].lhs;
1433 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1434 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1435 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1436 break;
1437 case _URE_OR:
1438 s1 = b->expr[state].lhs;
1439 s2 = b->expr[state].rhs;
1440 _ure_push(s1, b);
1441 _ure_push(s2, b);
1442 eval = 0;
1443 break;
1444 case _URE_AND:
1445 s1 = b->expr[state].lhs;
1446 s2 = b->expr[state].rhs;
1447 switch (b->expr[s1].type) {
1448 case _URE_SYMBOL:
1449 _ure_add_symstate(b->expr[s1].lhs, s2, b);
1450 syms++;
1451 eval = 0;
1452 break;
1453 case _URE_ONE:
1454 state = s2;
1455 break;
1456 case _URE_QUEST:
1457 ns1 = b->expr[s1].lhs;
1458 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1459 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1460 break;
1461 case _URE_PLUS:
1462 ns1 = b->expr[s1].lhs;
1463 ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1464 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1465 break;
1466 case _URE_STAR:
1467 ns1 = b->expr[s1].lhs;
1468 ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1469 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1470 break;
1471 case _URE_OR:
1472 ns1 = b->expr[s1].lhs;
1473 ns2 = b->expr[s1].rhs;
1474 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1475 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1476 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1477 break;
1478 case _URE_AND:
1479 ns1 = b->expr[s1].lhs;
1480 ns2 = b->expr[s1].rhs;
1481 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1482 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1483 break;
1484 }
1485 }
1486 }
1487 }
1488
1489 /*
1490 * Clear the state stack.
1491 */
1492 while (_ure_pop(b) != _URE_NOOP) ;
1493
1494 /*
1495 * Reset the state pointer because the reduction may have moved it
1496 * during a reallocation.
1497 */
1498 sp = b->states.states + i;
1499
1500 /*
1501 * Generate the DFA states for the symbols collected during the
1502 * current reduction.
1503 */
1504 if (sp->trans_used + syms > sp->trans_size) {
1505 if (sp->trans_size == 0)
1506 sp->trans = (_ure_elt_t *)
1507 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1508 else
1509 sp->trans = (_ure_elt_t *)
1510 realloc((char *) sp->trans,
1511 sizeof(_ure_elt_t) * (sp->trans_used + syms));
1512 sp->trans_size = sp->trans_used + syms;
1513 }
1514
1515 /*
1516 * Go through the symbol table and generate the DFA state transitions
1517 * for each symbol that has collected NFA states.
1518 */
1519 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1520 sp = b->states.states + i;
1521
1522 if (smp->states.slist_used > 0) {
1523 sp->trans[syms].lhs = smp->id;
1524 rhs = _ure_add_state(smp->states.slist_used,
1525 smp->states.slist, b);
1526 /*
1527 * Reset the state pointer in case the reallocation moves it
1528 * in memory.
1529 */
1530 sp = b->states.states + i;
1531 sp->trans[syms].rhs = rhs;
1532
1533 smp->states.slist_used = 0;
1534 syms++;
1535 }
1536 }
1537
1538 /*
1539 * Set the number of transitions actually used.
1540 */
1541 sp->trans_used = syms;
1542 }
1543 b->reducing = 0;
1544 }
1545
1546 static void
_ure_add_equiv(ucs2_t l,ucs2_t r,_ure_buffer_t * b)1547 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1548 {
1549 ucs2_t tmp;
1550
1551 l = b->states.states[l].id;
1552 r = b->states.states[r].id;
1553
1554 if (l == r)
1555 return;
1556
1557 if (l > r) {
1558 tmp = l;
1559 l = r;
1560 r = tmp;
1561 }
1562
1563 /*
1564 * Check to see if the equivalence pair already exists.
1565 */
1566 for (tmp = 0; tmp < b->equiv_used &&
1567 (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1568 tmp++) ;
1569
1570 if (tmp < b->equiv_used)
1571 return;
1572
1573 if (b->equiv_used == b->equiv_size) {
1574 if (b->equiv_size == 0)
1575 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1576 else
1577 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1578 sizeof(_ure_equiv_t) *
1579 (b->equiv_size + 8));
1580 b->equiv_size += 8;
1581 }
1582 b->equiv[b->equiv_used].l = l;
1583 b->equiv[b->equiv_used].r = r;
1584 b->equiv_used++;
1585 }
1586
1587 /*
1588 * Merge the DFA states that are equivalent.
1589 */
1590 static void
_ure_merge_equiv(_ure_buffer_t * b)1591 _ure_merge_equiv(_ure_buffer_t *b)
1592 {
1593 ucs2_t i, j, k, eq, done;
1594 _ure_state_t *sp1, *sp2, *ls, *rs;
1595
1596 for (i = 0; i < b->states.states_used; i++) {
1597 sp1 = b->states.states + i;
1598 if (sp1->id != i)
1599 continue;
1600 for (j = 0; j < i; j++) {
1601 sp2 = b->states.states + j;
1602 if (sp2->id != j)
1603 continue;
1604 b->equiv_used = 0;
1605 _ure_add_equiv(i, j, b);
1606 for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1607 ls = b->states.states + b->equiv[eq].l;
1608 rs = b->states.states + b->equiv[eq].r;
1609 if (ls->accepting != rs->accepting ||
1610 ls->trans_used != rs->trans_used) {
1611 done = 1;
1612 break;
1613 }
1614 for (k = 0; k < ls->trans_used &&
1615 ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1616 if (k < ls->trans_used) {
1617 done = 1;
1618 break;
1619 }
1620
1621 for (k = 0; k < ls->trans_used; k++)
1622 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1623 }
1624 if (done == 0)
1625 break;
1626 }
1627 for (eq = 0; j < i && eq < b->equiv_used; eq++)
1628 b->states.states[b->equiv[eq].r].id =
1629 b->states.states[b->equiv[eq].l].id;
1630 }
1631
1632 /*
1633 * Renumber the states appropriately.
1634 */
1635 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1636 sp1++, i++)
1637 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1638 }
1639
1640 /*************************************************************************
1641 *
1642 * API.
1643 *
1644 *************************************************************************/
1645
1646 ure_buffer_t
ure_buffer_create(void)1647 ure_buffer_create(void)
1648 {
1649 ure_buffer_t b;
1650
1651 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1652
1653 return b;
1654 }
1655
1656 void
ure_buffer_free(ure_buffer_t buf)1657 ure_buffer_free(ure_buffer_t buf)
1658 {
1659 unsigned long i;
1660
1661 if (buf == 0)
1662 return;
1663
1664 if (buf->stack.slist_size > 0)
1665 free((char *) buf->stack.slist);
1666
1667 if (buf->expr_size > 0)
1668 free((char *) buf->expr);
1669
1670 for (i = 0; i < buf->symtab_size; i++) {
1671 if (buf->symtab[i].states.slist_size > 0)
1672 free((char *) buf->symtab[i].states.slist);
1673 }
1674
1675 if (buf->symtab_size > 0)
1676 free((char *) buf->symtab);
1677
1678 for (i = 0; i < buf->states.states_size; i++) {
1679 if (buf->states.states[i].trans_size > 0)
1680 free((char *) buf->states.states[i].trans);
1681 if (buf->states.states[i].st.slist_size > 0)
1682 free((char *) buf->states.states[i].st.slist);
1683 }
1684
1685 if (buf->states.states_size > 0)
1686 free((char *) buf->states.states);
1687
1688 if (buf->equiv_size > 0)
1689 free((char *) buf->equiv);
1690
1691 free((char *) buf);
1692 }
1693
1694 ure_dfa_t
ure_compile(ucs2_t * re,unsigned long relen,int casefold,ure_buffer_t buf)1695 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1696 {
1697 ucs2_t i, j, state;
1698 _ure_state_t *sp;
1699 _ure_dstate_t *dsp;
1700 _ure_trans_t *tp;
1701 ure_dfa_t dfa;
1702
1703 if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1704 return 0;
1705
1706 /*
1707 * Reset the various fields of the compilation buffer. Default the flags
1708 * to indicate the presence of the "^$" pattern. If any other pattern
1709 * occurs, then this flag will be removed. This is done to catch this
1710 * special pattern and handle it specially when matching.
1711 */
1712 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1713 buf->reducing = 0;
1714 buf->stack.slist_used = 0;
1715 buf->expr_used = 0;
1716
1717 for (i = 0; i < buf->symtab_used; i++)
1718 buf->symtab[i].states.slist_used = 0;
1719 buf->symtab_used = 0;
1720
1721 for (i = 0; i < buf->states.states_used; i++) {
1722 buf->states.states[i].st.slist_used = 0;
1723 buf->states.states[i].trans_used = 0;
1724 }
1725 buf->states.states_used = 0;
1726
1727 /*
1728 * Construct the NFA. If this stage returns a 0, then an error occurred or
1729 * an empty expression was passed.
1730 */
1731 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1732 return 0;
1733
1734 /*
1735 * Do the expression reduction to get the initial DFA.
1736 */
1737 _ure_reduce(state, buf);
1738
1739 /*
1740 * Merge all the equivalent DFA states.
1741 */
1742 _ure_merge_equiv(buf);
1743
1744 /*
1745 * Construct the minimal DFA.
1746 */
1747 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1748 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1749
1750 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1751
1752 /*
1753 * Free up the NFA state groups and transfer the symbols from the buffer
1754 * to the DFA.
1755 */
1756 for (i = 0; i < buf->symtab_size; i++) {
1757 if (buf->symtab[i].states.slist_size > 0)
1758 free((char *) buf->symtab[i].states.slist);
1759 }
1760 dfa->syms = buf->symtab;
1761 dfa->nsyms = buf->symtab_used;
1762
1763 buf->symtab_used = buf->symtab_size = 0;
1764
1765 /*
1766 * Collect the total number of states and transitions needed for the DFA.
1767 */
1768 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1769 i++, sp++) {
1770 if (sp->id == state) {
1771 dfa->nstates++;
1772 dfa->ntrans += sp->trans_used;
1773 state++;
1774 }
1775 }
1776
1777 /*
1778 * Allocate enough space for the states and transitions.
1779 */
1780 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1781 dfa->nstates);
1782 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1783
1784 /*
1785 * Actually transfer the DFA states from the buffer.
1786 */
1787 dsp = dfa->states;
1788 tp = dfa->trans;
1789 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1790 i++, sp++) {
1791 if (sp->id == state) {
1792 dsp->trans = tp;
1793 dsp->ntrans = sp->trans_used;
1794 dsp->accepting = sp->accepting;
1795
1796 /*
1797 * Add the transitions for the state.
1798 */
1799 for (j = 0; j < dsp->ntrans; j++, tp++) {
1800 tp->symbol = sp->trans[j].lhs;
1801 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1802 }
1803
1804 dsp++;
1805 state++;
1806 }
1807 }
1808
1809 return dfa;
1810 }
1811
1812 void
ure_dfa_free(ure_dfa_t dfa)1813 ure_dfa_free(ure_dfa_t dfa)
1814 {
1815 ucs2_t i;
1816
1817 if (dfa == 0)
1818 return;
1819
1820 for (i = 0; i < dfa->nsyms; i++) {
1821 if ((dfa->syms[i].type == _URE_CCLASS ||
1822 dfa->syms[i].type == _URE_NCCLASS) &&
1823 dfa->syms[i].sym.ccl.ranges_size > 0)
1824 free((char *) dfa->syms[i].sym.ccl.ranges);
1825 }
1826 if (dfa->nsyms > 0)
1827 free((char *) dfa->syms);
1828
1829 if (dfa->nstates > 0)
1830 free((char *) dfa->states);
1831 if (dfa->ntrans > 0)
1832 free((char *) dfa->trans);
1833 free((char *) dfa);
1834 }
1835
1836 void
ure_write_dfa(ure_dfa_t dfa,FILE * out)1837 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1838 {
1839 ucs2_t i, j, k, h, l;
1840 _ure_dstate_t *sp;
1841 _ure_symtab_t *sym;
1842 _ure_range_t *rp;
1843
1844 if (dfa == 0 || out == 0)
1845 return;
1846
1847 /*
1848 * Write all the different character classes.
1849 */
1850 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1851 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1852 fprintf(out, "C%hd = ", sym->id);
1853 if (sym->sym.ccl.ranges_used > 0) {
1854 putc('[', out);
1855 if (sym->type == _URE_NCCLASS)
1856 putc('^', out);
1857 }
1858 if (sym->props != 0) {
1859 if (sym->type == _URE_NCCLASS)
1860 fprintf(out, "\\P");
1861 else
1862 fprintf(out, "\\p");
1863 for (k = h = 0; k < 32; k++) {
1864 if (sym->props & (1 << k)) {
1865 if (h != 0)
1866 putc(',', out);
1867 fprintf(out, "%d", k + 1);
1868 h = 1;
1869 }
1870 }
1871 }
1872 /*
1873 * Dump the ranges.
1874 */
1875 for (k = 0, rp = sym->sym.ccl.ranges;
1876 k < sym->sym.ccl.ranges_used; k++, rp++) {
1877 /*
1878 * Check for UTF16 characters.
1879 */
1880 if (0x10000 <= rp->min_code &&
1881 rp->min_code <= 0x10ffff) {
1882 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1883 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1884 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1885 } else
1886 fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1887 if (rp->max_code != rp->min_code) {
1888 putc('-', out);
1889 if (rp->max_code >= 0x10000 &&
1890 rp->max_code <= 0x10ffff) {
1891 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1892 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1893 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1894 } else
1895 fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1896 }
1897 }
1898 if (sym->sym.ccl.ranges_used > 0)
1899 putc(']', out);
1900 putc('\n', out);
1901 }
1902 }
1903
1904 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1905 fprintf(out, "S%hd = ", i);
1906 if (sp->accepting) {
1907 fprintf(out, "1 ");
1908 if (sp->ntrans)
1909 fprintf(out, "| ");
1910 }
1911 for (j = 0; j < sp->ntrans; j++) {
1912 if (j > 0)
1913 fprintf(out, "| ");
1914
1915 sym = dfa->syms + sp->trans[j].symbol;
1916 switch (sym->type) {
1917 case _URE_CHAR:
1918 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1919 /*
1920 * Take care of UTF16 characters.
1921 */
1922 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1923 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1924 fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1925 } else
1926 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1927 break;
1928 case _URE_ANY_CHAR:
1929 fprintf(out, "<any> ");
1930 break;
1931 case _URE_BOL_ANCHOR:
1932 fprintf(out, "<bol-anchor> ");
1933 break;
1934 case _URE_EOL_ANCHOR:
1935 fprintf(out, "<eol-anchor> ");
1936 break;
1937 case _URE_CCLASS:
1938 case _URE_NCCLASS:
1939 fprintf(out, "[C%hd] ", sym->id);
1940 break;
1941 }
1942 fprintf(out, "S%hd", sp->trans[j].next_state);
1943 if (j + 1 < sp->ntrans)
1944 putc(' ', out);
1945 }
1946 putc('\n', out);
1947 }
1948 }
1949
1950 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1951 (cc) == 0x2029)
1952
1953 int
ure_exec(ure_dfa_t dfa,int flags,ucs2_t * text,unsigned long textlen,unsigned long * match_start,unsigned long * match_end)1954 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1955 unsigned long *match_start, unsigned long *match_end)
1956 {
1957 int i, j, matched, found, skip;
1958 unsigned long ms, me;
1959 ucs4_t c;
1960 ucs2_t *sp, *ep, *lp;
1961 _ure_dstate_t *stp;
1962 _ure_symtab_t *sym;
1963 _ure_range_t *rp;
1964
1965 if (dfa == 0 || text == 0)
1966 return 0;
1967
1968 /*
1969 * Handle the special case of an empty string matching the "^$" pattern.
1970 */
1971 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1972 *match_start = *match_end = 0;
1973 return 1;
1974 }
1975
1976 sp = text;
1977 ep = sp + textlen;
1978
1979 ms = me = ~0;
1980
1981 stp = dfa->states;
1982
1983 for (found = skip = 0; found == 0 && sp < ep; ) {
1984 lp = sp;
1985 c = *sp++;
1986
1987 /*
1988 * Check to see if this is a high surrogate that should be
1989 * combined with a following low surrogate.
1990 */
1991 if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1992 0xdc00 <= *sp && *sp <= 0xdfff)
1993 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1994
1995 /*
1996 * Determine if the character is non-spacing and should be skipped.
1997 */
1998 if (_ure_matches_properties(_URE_NONSPACING, c) &&
1999 (flags & URE_IGNORE_NONSPACING)) {
2000 sp++;
2001 continue;
2002 }
2003
2004 if (dfa->flags & _URE_DFA_CASEFOLD)
2005 c = _ure_tolower(c);
2006
2007 /*
2008 * See if one of the transitions matches.
2009 */
2010 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2011 sym = dfa->syms + stp->trans[i].symbol;
2012 switch (sym->type) {
2013 case _URE_ANY_CHAR:
2014 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2015 !_ure_issep(c))
2016 matched = 1;
2017 break;
2018 case _URE_CHAR:
2019 if (c == sym->sym.chr)
2020 matched = 1;
2021 break;
2022 case _URE_BOL_ANCHOR:
2023 if (lp == text) {
2024 sp = lp;
2025 matched = 1;
2026 } else if (_ure_issep(c)) {
2027 if (c == '\r' && sp < ep && *sp == '\n')
2028 sp++;
2029 lp = sp;
2030 matched = 1;
2031 }
2032 break;
2033 case _URE_EOL_ANCHOR:
2034 if (_ure_issep(c)) {
2035 /*
2036 * Put the pointer back before the separator so the match
2037 * end position will be correct. This case will also
2038 * cause the `sp' pointer to be advanced over the current
2039 * separator once the match end point has been recorded.
2040 */
2041 sp = lp;
2042 matched = 1;
2043 }
2044 break;
2045 case _URE_CCLASS:
2046 case _URE_NCCLASS:
2047 if (sym->props != 0)
2048 matched = _ure_matches_properties(sym->props, c);
2049 for (j = 0, rp = sym->sym.ccl.ranges;
2050 j < sym->sym.ccl.ranges_used; j++, rp++) {
2051 if (rp->min_code <= c && c <= rp->max_code)
2052 matched = 1;
2053 }
2054 if (sym->type == _URE_NCCLASS)
2055 matched = !matched;
2056 break;
2057 }
2058
2059 if (matched) {
2060 if (ms == ~0UL)
2061 ms = lp - text;
2062 else
2063 me = sp - text;
2064 stp = dfa->states + stp->trans[i].next_state;
2065
2066 /*
2067 * If the match was an EOL anchor, adjust the pointer past the
2068 * separator that caused the match. The correct match
2069 * position has been recorded already.
2070 */
2071 if (sym->type == _URE_EOL_ANCHOR) {
2072 /*
2073 * Skip the character that caused the match.
2074 */
2075 sp++;
2076
2077 /*
2078 * Handle the infamous CRLF situation.
2079 */
2080 if (sp < ep && c == '\r' && *sp == '\n')
2081 sp++;
2082 }
2083 }
2084 }
2085
2086 if (matched == 0) {
2087 if (stp->accepting == 0) {
2088 /*
2089 * If the last state was not accepting, then reset
2090 * and start over.
2091 */
2092 stp = dfa->states;
2093 ms = me = ~0;
2094 } else
2095 /*
2096 * The last state was accepting, so terminate the matching
2097 * loop to avoid more work.
2098 */
2099 found = 1;
2100 } else if (sp == ep) {
2101 if (!stp->accepting) {
2102 /*
2103 * This ugly hack is to make sure the end-of-line anchors
2104 * match when the source text hits the end. This is only done
2105 * if the last subexpression matches.
2106 */
2107 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2108 sym = dfa->syms + stp->trans[i].symbol;
2109 if (sym->type ==_URE_EOL_ANCHOR) {
2110 stp = dfa->states + stp->trans[i].next_state;
2111 if (stp->accepting) {
2112 me = sp - text;
2113 found = 1;
2114 } else
2115 break;
2116 }
2117 }
2118 } else {
2119 /*
2120 * Make sure any conditions that match all the way to the end
2121 * of the string match.
2122 */
2123 found = 1;
2124 me = sp - text;
2125 }
2126 }
2127 }
2128
2129 if (found == 0)
2130 ms = me = ~0;
2131
2132 *match_start = ms;
2133 *match_end = me;
2134
2135 return (ms != ~0UL) ? 1 : 0;
2136 }
2137