xref: /netbsd-src/external/bsd/openldap/dist/libraries/liblunicode/ure/ure.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1 /*	$NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2021 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Copyright 1997, 1998, 1999 Computing Research Labs,
18  * New Mexico State University
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining a
21  * copy of this software and associated documentation files (the "Software"),
22  * to deal in the Software without restriction, including without limitation
23  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24  * and/or sell copies of the Software, and to permit persons to whom the
25  * Software is furnished to do so, subject to the following conditions:
26  *
27  * The above copyright notice and this permission notice shall be included in
28  * all copies or substantial portions of the Software.
29  *
30  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37  */
38 /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp " */
39 
40 #include <sys/cdefs.h>
41 __RCSID("$NetBSD: ure.c,v 1.3 2021/08/14 16:14:57 christos Exp $");
42 
43 #include "portable.h"
44 
45 #include <ac/stdlib.h>
46 #include <ac/string.h>
47 #include <ac/unistd.h>
48 
49 #include "ure.h"
50 
51 /*
52  * Flags used internally in the DFA.
53  */
54 #define _URE_DFA_CASEFOLD  0x01
55 #define _URE_DFA_BLANKLINE 0x02
56 
57 static unsigned long cclass_flags[] = {
58     0,
59     _URE_NONSPACING,
60     _URE_COMBINING,
61     _URE_NUMDIGIT,
62     _URE_NUMOTHER,
63     _URE_SPACESEP,
64     _URE_LINESEP,
65     _URE_PARASEP,
66     _URE_CNTRL,
67     _URE_PUA,
68     _URE_UPPER,
69     _URE_LOWER,
70     _URE_TITLE,
71     _URE_MODIFIER,
72     _URE_OTHERLETTER,
73     _URE_DASHPUNCT,
74     _URE_OPENPUNCT,
75     _URE_CLOSEPUNCT,
76     _URE_OTHERPUNCT,
77     _URE_MATHSYM,
78     _URE_CURRENCYSYM,
79     _URE_OTHERSYM,
80     _URE_LTR,
81     _URE_RTL,
82     _URE_EURONUM,
83     _URE_EURONUMSEP,
84     _URE_EURONUMTERM,
85     _URE_ARABNUM,
86     _URE_COMMONSEP,
87     _URE_BLOCKSEP,
88     _URE_SEGMENTSEP,
89     _URE_WHITESPACE,
90     _URE_OTHERNEUT,
91 };
92 
93 /*
94  * Symbol types for the DFA.
95  */
96 #define _URE_ANY_CHAR   1
97 #define _URE_CHAR       2
98 #define _URE_CCLASS     3
99 #define _URE_NCCLASS    4
100 #define _URE_BOL_ANCHOR 5
101 #define _URE_EOL_ANCHOR 6
102 
103 /*
104  * Op codes for converting the NFA to a DFA.
105  */
106 #define _URE_SYMBOL     10
107 #define _URE_PAREN      11
108 #define _URE_QUEST      12
109 #define _URE_STAR       13
110 #define _URE_PLUS       14
111 #define _URE_ONE        15
112 #define _URE_AND        16
113 #define _URE_OR         17
114 
115 #define _URE_NOOP       0xffff
116 
117 #define _URE_REGSTART 0x8000
118 #define _URE_REGEND   0x4000
119 
120 /*
121  * Structure used to handle a compacted range of characters.
122  */
123 typedef struct {
124     ucs4_t min_code;
125     ucs4_t max_code;
126 } _ure_range_t;
127 
128 typedef struct {
129     _ure_range_t *ranges;
130     ucs2_t ranges_used;
131     ucs2_t ranges_size;
132 } _ure_ccl_t;
133 
134 typedef union {
135     ucs4_t chr;
136     _ure_ccl_t ccl;
137 } _ure_sym_t;
138 
139 /*
140  * This is a general element structure used for expressions and stack
141  * elements.
142  */
143 typedef struct {
144     ucs2_t reg;
145     ucs2_t onstack;
146     ucs2_t type;
147     ucs2_t lhs;
148     ucs2_t rhs;
149 } _ure_elt_t;
150 
151 /*
152  * This is a structure used to track a list or a stack of states.
153  */
154 typedef struct {
155     ucs2_t *slist;
156     ucs2_t slist_size;
157     ucs2_t slist_used;
158 } _ure_stlist_t;
159 
160 /*
161  * Structure to track the list of unique states for a symbol
162  * during reduction.
163  */
164 typedef struct {
165     ucs2_t id;
166     ucs2_t type;
167     unsigned long mods;
168     unsigned long props;
169     _ure_sym_t sym;
170     _ure_stlist_t states;
171 } _ure_symtab_t;
172 
173 /*
174  * Structure to hold a single state.
175  */
176 typedef struct {
177     ucs2_t id;
178     ucs2_t accepting;
179     ucs2_t pad;
180     _ure_stlist_t st;
181     _ure_elt_t *trans;
182     ucs2_t trans_size;
183     ucs2_t trans_used;
184 } _ure_state_t;
185 
186 /*
187  * Structure used for keeping lists of states.
188  */
189 typedef struct {
190     _ure_state_t *states;
191     ucs2_t states_size;
192     ucs2_t states_used;
193 } _ure_statetable_t;
194 
195 /*
196  * Structure to track pairs of DFA states when equivalent states are
197  * merged.
198  */
199 typedef struct {
200     ucs2_t l;
201     ucs2_t r;
202 } _ure_equiv_t;
203 
204 /*
205  * Structure used for constructing the NFA and reducing to a minimal DFA.
206  */
207 typedef struct _ure_buffer_t {
208     int reducing;
209     int error;
210     unsigned long flags;
211 
212     _ure_stlist_t stack;
213 
214     /*
215      * Table of unique symbols encountered.
216      */
217     _ure_symtab_t *symtab;
218     ucs2_t symtab_size;
219     ucs2_t symtab_used;
220 
221     /*
222      * Tracks the unique expressions generated for the NFA and when the NFA is
223      * reduced.
224      */
225     _ure_elt_t *expr;
226     ucs2_t expr_used;
227     ucs2_t expr_size;
228 
229     /*
230      * The reduced table of unique groups of NFA states.
231      */
232     _ure_statetable_t states;
233 
234     /*
235      * Tracks states when equivalent states are merged.
236      */
237     _ure_equiv_t *equiv;
238     ucs2_t equiv_used;
239     ucs2_t equiv_size;
240 } _ure_buffer_t;
241 
242 typedef struct {
243     ucs2_t symbol;
244     ucs2_t next_state;
245 } _ure_trans_t;
246 
247 typedef struct {
248     ucs2_t accepting;
249     ucs2_t ntrans;
250     _ure_trans_t *trans;
251 } _ure_dstate_t;
252 
253 typedef struct _ure_dfa_t {
254     unsigned long flags;
255 
256     _ure_symtab_t *syms;
257     ucs2_t nsyms;
258 
259     _ure_dstate_t *states;
260     ucs2_t nstates;
261 
262     _ure_trans_t *trans;
263     ucs2_t ntrans;
264 } _ure_dfa_t;
265 
266 /*************************************************************************
267  *
268  * Functions.
269  *
270  *************************************************************************/
271 
272 static void
_ure_memmove(char * dest,char * src,unsigned long bytes)273 _ure_memmove(char *dest, char *src, unsigned long bytes)
274 {
275     long i, j;
276 
277     i = (long) bytes;
278     j = i & 7;
279     i = (i + 7) >> 3;
280 
281     /*
282      * Do a memmove using Ye Olde Duff's Device for efficiency.
283      */
284     if (src < dest) {
285         src += bytes;
286         dest += bytes;
287 
288         switch (j) {
289           case 0: do {
290               *--dest = *--src;
291             case 7: *--dest = *--src;
292             case 6: *--dest = *--src;
293             case 5: *--dest = *--src;
294             case 4: *--dest = *--src;
295             case 3: *--dest = *--src;
296             case 2: *--dest = *--src;
297             case 1: *--dest = *--src;
298           } while (--i > 0);
299         }
300     } else if (src > dest) {
301         switch (j) {
302           case 0: do {
303               *dest++ = *src++;
304             case 7: *dest++ = *src++;
305             case 6: *dest++ = *src++;
306             case 5: *dest++ = *src++;
307             case 4: *dest++ = *src++;
308             case 3: *dest++ = *src++;
309             case 2: *dest++ = *src++;
310             case 1: *dest++ = *src++;
311           } while (--i > 0);
312         }
313     }
314 }
315 
316 static void
_ure_push(ucs2_t v,_ure_buffer_t * b)317 _ure_push(ucs2_t v, _ure_buffer_t *b)
318 {
319     _ure_stlist_t *s;
320 
321     if (b == 0)
322       return;
323 
324     /*
325      * If the `reducing' parameter is non-zero, check to see if the value
326      * passed is already on the stack.
327      */
328     if (b->reducing != 0 && b->expr[v].onstack != 0)
329       return;
330 
331     s = &b->stack;
332     if (s->slist_used == s->slist_size) {
333         if (s->slist_size == 0)
334           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
335         else
336           s->slist = (ucs2_t *) realloc((char *) s->slist,
337                                         sizeof(ucs2_t) * (s->slist_size + 8));
338         s->slist_size += 8;
339     }
340     s->slist[s->slist_used++] = v;
341 
342     /*
343      * If the `reducing' parameter is non-zero, flag the element as being on
344      * the stack.
345      */
346     if (b->reducing != 0)
347       b->expr[v].onstack = 1;
348 }
349 
350 static ucs2_t
_ure_peek(_ure_buffer_t * b)351 _ure_peek(_ure_buffer_t *b)
352 {
353     if (b == 0 || b->stack.slist_used == 0)
354       return _URE_NOOP;
355 
356     return b->stack.slist[b->stack.slist_used - 1];
357 }
358 
359 static ucs2_t
_ure_pop(_ure_buffer_t * b)360 _ure_pop(_ure_buffer_t *b)
361 {
362     ucs2_t v;
363 
364     if (b == 0 || b->stack.slist_used == 0)
365       return _URE_NOOP;
366 
367     v = b->stack.slist[--b->stack.slist_used];
368     if (b->reducing)
369       b->expr[v].onstack = 0;
370 
371     return v;
372 }
373 
374 /*************************************************************************
375  *
376  * Start symbol parse functions.
377  *
378  *************************************************************************/
379 
380 /*
381  * Parse a comma-separated list of integers that represent character
382  * properties.  Combine them into a mask that is returned in the `mask'
383  * variable, and return the number of characters consumed.
384  */
385 static unsigned long
_ure_prop_list(ucs2_t * pp,unsigned long limit,unsigned long * mask,_ure_buffer_t * b)386 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
387                _ure_buffer_t *b)
388 {
389     unsigned long n, m;
390     ucs2_t *sp, *ep;
391 
392     sp = pp;
393     ep = sp + limit;
394 
395     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
396         if (*sp == ',') {
397             /*
398              * Encountered a comma, so select the next character property flag
399              * and reset the number.
400              */
401             m |= cclass_flags[n];
402             n = 0;
403         } else if (*sp >= '0' && *sp <= '9')
404           /*
405            * Encountered a digit, so start or continue building the cardinal
406            * that represents the character property flag.
407            */
408           n = (n * 10) + (*sp - '0');
409         else
410           /*
411            * Encountered something that is not part of the property list.
412            * Indicate that we are done.
413            */
414           break;
415 
416         /*
417          * If a property number greater than 32 occurs, then there is a
418          * problem.  Most likely a missing comma separator.
419          */
420         if (n > 32)
421           b->error = _URE_INVALID_PROPERTY;
422     }
423 
424     if (b->error == _URE_OK && n != 0)
425       m |= cclass_flags[n];
426 
427     /*
428      * Set the mask that represents the group of character properties.
429      */
430     *mask = m;
431 
432     /*
433      * Return the number of characters consumed.
434      */
435     return sp - pp;
436 }
437 
438 /*
439  * Collect a hex number with 1 to 4 digits and return the number
440  * of characters used.
441  */
442 static unsigned long
_ure_hex(ucs2_t * np,unsigned long limit,ucs4_t * n)443 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
444 {
445     ucs2_t i;
446     ucs2_t *sp, *ep;
447     ucs4_t nn;
448 
449     sp = np;
450     ep = sp + limit;
451 
452     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
453         if (*sp >= '0' && *sp <= '9')
454           nn = (nn << 4) + (*sp - '0');
455         else if (*sp >= 'A' && *sp <= 'F')
456           nn = (nn << 4) + ((*sp - 'A') + 10);
457         else if (*sp >= 'a' && *sp <= 'f')
458           nn = (nn << 4) + ((*sp - 'a') + 10);
459         else
460           /*
461            * Encountered something that is not a hex digit.
462            */
463           break;
464     }
465 
466     /*
467      * Assign the character code collected and return the number of
468      * characters used.
469      */
470     *n = nn;
471 
472     return sp - np;
473 }
474 
475 /*
476  * Insert a range into a character class, removing duplicates and ordering
477  * them in increasing range-start order.
478  */
479 static void
_ure_add_range(_ure_ccl_t * ccl,_ure_range_t * r,_ure_buffer_t * b)480 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
481 {
482     ucs2_t i;
483     ucs4_t tmp;
484     _ure_range_t *rp;
485 
486     /*
487      * If the `casefold' flag is set, then make sure both endpoints of the
488      * range are converted to lower case.
489      */
490     if (b->flags & _URE_DFA_CASEFOLD) {
491         r->min_code = _ure_tolower(r->min_code);
492         r->max_code = _ure_tolower(r->max_code);
493     }
494 
495     /*
496      * Swap the range endpoints if they are not in increasing order.
497      */
498     if (r->min_code > r->max_code) {
499         tmp = r->min_code;
500         r->min_code = r->max_code;
501         r->max_code = tmp;
502     }
503 
504     for (i = 0, rp = ccl->ranges;
505          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
506 
507     /*
508      * Check for a duplicate.
509      */
510     if (i < ccl->ranges_used &&
511         r->min_code == rp->min_code && r->max_code == rp->max_code)
512       return;
513 
514     if (ccl->ranges_used == ccl->ranges_size) {
515         if (ccl->ranges_size == 0)
516           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
517         else
518           ccl->ranges = (_ure_range_t *)
519               realloc((char *) ccl->ranges,
520                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
521         ccl->ranges_size += 8;
522     }
523 
524     rp = ccl->ranges + ccl->ranges_used;
525 
526     if (i < ccl->ranges_used)
527       _ure_memmove((char *) (rp + 1), (char *) rp,
528                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
529 
530     ccl->ranges_used++;
531     rp->min_code = r->min_code;
532     rp->max_code = r->max_code;
533 }
534 
535 #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
536 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
537 #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
538 #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
539 _URE_OTHERPUNCT)
540 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
541 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
542 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
543 #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
544 
545 typedef void (*_ure_cclsetup_t)(
546     _ure_symtab_t *sym,
547     unsigned long mask,
548     _ure_buffer_t *b
549 );
550 
551 typedef struct {
552     ucs2_t key;
553     unsigned long len;
554     unsigned long next;
555     _ure_cclsetup_t func;
556     unsigned long mask;
557 } _ure_trie_t;
558 
559 static void
_ure_ccl_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)560 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
561 {
562     sym->props |= mask;
563 }
564 
565 static void
_ure_space_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)566 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
567 {
568     _ure_range_t range;
569 
570     sym->props |= mask;
571 
572     /*
573      * Add the additional characters needed for handling isspace().
574      */
575     range.min_code = range.max_code = '\t';
576     _ure_add_range(&sym->sym.ccl, &range, b);
577     range.min_code = range.max_code = '\r';
578     _ure_add_range(&sym->sym.ccl, &range, b);
579     range.min_code = range.max_code = '\n';
580     _ure_add_range(&sym->sym.ccl, &range, b);
581     range.min_code = range.max_code = '\f';
582     _ure_add_range(&sym->sym.ccl, &range, b);
583     range.min_code = range.max_code = 0xfeff;
584     _ure_add_range(&sym->sym.ccl, &range, b);
585 }
586 
587 static void
_ure_xdigit_setup(_ure_symtab_t * sym,unsigned long mask,_ure_buffer_t * b)588 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
589 {
590     _ure_range_t range;
591 
592     /*
593      * Add the additional characters needed for handling isxdigit().
594      */
595     range.min_code = '0';
596     range.max_code = '9';
597     _ure_add_range(&sym->sym.ccl, &range, b);
598     range.min_code = 'A';
599     range.max_code = 'F';
600     _ure_add_range(&sym->sym.ccl, &range, b);
601     range.min_code = 'a';
602     range.max_code = 'f';
603     _ure_add_range(&sym->sym.ccl, &range, b);
604 }
605 
606 static _ure_trie_t cclass_trie[] = {
607     {0x003a, 1, 1, 0, 0},
608     {0x0061, 9, 10, 0, 0},
609     {0x0063, 8, 19, 0, 0},
610     {0x0064, 7, 24, 0, 0},
611     {0x0067, 6, 29, 0, 0},
612     {0x006c, 5, 34, 0, 0},
613     {0x0070, 4, 39, 0, 0},
614     {0x0073, 3, 49, 0, 0},
615     {0x0075, 2, 54, 0, 0},
616     {0x0078, 1, 59, 0, 0},
617     {0x006c, 1, 11, 0, 0},
618     {0x006e, 2, 13, 0, 0},
619     {0x0070, 1, 16, 0, 0},
620     {0x0075, 1, 14, 0, 0},
621     {0x006d, 1, 15, 0, 0},
622     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
623     {0x0068, 1, 17, 0, 0},
624     {0x0061, 1, 18, 0, 0},
625     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
626     {0x006e, 1, 20, 0, 0},
627     {0x0074, 1, 21, 0, 0},
628     {0x0072, 1, 22, 0, 0},
629     {0x006c, 1, 23, 0, 0},
630     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
631     {0x0069, 1, 25, 0, 0},
632     {0x0067, 1, 26, 0, 0},
633     {0x0069, 1, 27, 0, 0},
634     {0x0074, 1, 28, 0, 0},
635     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
636     {0x0072, 1, 30, 0, 0},
637     {0x0061, 1, 31, 0, 0},
638     {0x0070, 1, 32, 0, 0},
639     {0x0068, 1, 33, 0, 0},
640     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
641     {0x006f, 1, 35, 0, 0},
642     {0x0077, 1, 36, 0, 0},
643     {0x0065, 1, 37, 0, 0},
644     {0x0072, 1, 38, 0, 0},
645     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
646     {0x0072, 2, 41, 0, 0},
647     {0x0075, 1, 45, 0, 0},
648     {0x0069, 1, 42, 0, 0},
649     {0x006e, 1, 43, 0, 0},
650     {0x0074, 1, 44, 0, 0},
651     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
652     {0x006e, 1, 46, 0, 0},
653     {0x0063, 1, 47, 0, 0},
654     {0x0074, 1, 48, 0, 0},
655     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
656     {0x0070, 1, 50, 0, 0},
657     {0x0061, 1, 51, 0, 0},
658     {0x0063, 1, 52, 0, 0},
659     {0x0065, 1, 53, 0, 0},
660     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
661     {0x0070, 1, 55, 0, 0},
662     {0x0070, 1, 56, 0, 0},
663     {0x0065, 1, 57, 0, 0},
664     {0x0072, 1, 58, 0, 0},
665     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
666     {0x0064, 1, 60, 0, 0},
667     {0x0069, 1, 61, 0, 0},
668     {0x0067, 1, 62, 0, 0},
669     {0x0069, 1, 63, 0, 0},
670     {0x0074, 1, 64, 0, 0},
671     {0x003a, 1, 65, _ure_xdigit_setup, 0},
672 };
673 
674 /*
675  * Probe for one of the POSIX colon delimited character classes in the static
676  * trie.
677  */
678 static unsigned long
_ure_posix_ccl(ucs2_t * cp,unsigned long limit,_ure_symtab_t * sym,_ure_buffer_t * b)679 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
680                _ure_buffer_t *b)
681 {
682     int i;
683     unsigned long n;
684     _ure_trie_t *tp;
685     ucs2_t *sp, *ep;
686 
687     /*
688      * If the number of characters left is less than 7, then this cannot be
689      * interpreted as one of the colon delimited classes.
690      */
691     if (limit < 7)
692       return 0;
693 
694     sp = cp;
695     ep = sp + limit;
696     tp = cclass_trie;
697     for (i = 0; sp < ep && i < 8; i++, sp++) {
698         n = tp->len;
699 
700         for (; n > 0 && tp->key != *sp; tp++, n--) ;
701 
702         if (n == 0)
703           return 0;
704 
705         if (*sp == ':' && (i == 6 || i == 7)) {
706             sp++;
707             break;
708         }
709         if (sp + 1 < ep)
710           tp = cclass_trie + tp->next;
711     }
712     if (tp->func == 0)
713       return 0;
714 
715     (*tp->func)(sym, tp->mask, b);
716 
717     return sp - cp;
718 }
719 
720 /*
721  * Construct a list of ranges and return the number of characters consumed.
722  */
723 static unsigned long
_ure_cclass(ucs2_t * cp,unsigned long limit,_ure_symtab_t * symp,_ure_buffer_t * b)724 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
725             _ure_buffer_t *b)
726 {
727     int range_end;
728     unsigned long n;
729     ucs2_t *sp, *ep;
730     ucs4_t c, last;
731     _ure_ccl_t *cclp;
732     _ure_range_t range;
733 
734     sp = cp;
735     ep = sp + limit;
736 
737     if (*sp == '^') {
738       symp->type = _URE_NCCLASS;
739       sp++;
740     } else
741       symp->type = _URE_CCLASS;
742 
743     for (last = 0, range_end = 0;
744          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
745         c = *sp++;
746         if (c == '\\') {
747             if (sp == ep) {
748                 /*
749                  * The EOS was encountered when expecting the reverse solidus
750                  * to be followed by the character it is escaping.  Set an
751                  * error code and return the number of characters consumed up
752                  * to this point.
753                  */
754                 b->error = _URE_UNEXPECTED_EOS;
755                 return sp - cp;
756             }
757 
758             c = *sp++;
759             switch (c) {
760               case 'a':
761                 c = 0x07;
762                 break;
763               case 'b':
764                 c = 0x08;
765                 break;
766               case 'f':
767                 c = 0x0c;
768                 break;
769               case 'n':
770                 c = 0x0a;
771                 break;
772               case 'r':
773                 c = 0x0d;
774                 break;
775               case 't':
776                 c = 0x09;
777                 break;
778               case 'v':
779                 c = 0x0b;
780                 break;
781               case 'p':
782               case 'P':
783                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
784                 /*
785                  * Invert the bit mask of the properties if this is a negated
786                  * character class or if 'P' is used to specify a list of
787                  * character properties that should *not* match in a
788                  * character class.
789                  */
790                 if (c == 'P')
791                   symp->props = ~symp->props;
792                 continue;
793                 break;
794               case 'x':
795               case 'X':
796               case 'u':
797               case 'U':
798                 if (sp < ep &&
799                     ((*sp >= '0' && *sp <= '9') ||
800                      (*sp >= 'A' && *sp <= 'F') ||
801                      (*sp >= 'a' && *sp <= 'f')))
802                   sp += _ure_hex(sp, ep - sp, &c);
803             }
804         } else if (c == ':') {
805             /*
806              * Probe for a POSIX colon delimited character class.
807              */
808             sp--;
809             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
810               sp++;
811             else {
812                 sp += n;
813                 continue;
814             }
815         }
816 
817         cclp = &symp->sym.ccl;
818 
819         /*
820          * Check to see if the current character is a low surrogate that needs
821          * to be combined with a preceding high surrogate.
822          */
823         if (last != 0) {
824             if (c >= 0xdc00 && c <= 0xdfff)
825               /*
826                * Construct the UTF16 character code.
827                */
828               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
829             else {
830                 /*
831                  * Add the isolated high surrogate to the range.
832                  */
833                 if (range_end == 1)
834                   range.max_code = last & 0xffff;
835                 else
836                   range.min_code = range.max_code = last & 0xffff;
837 
838                 _ure_add_range(cclp, &range, b);
839                 range_end = 0;
840             }
841         }
842 
843         /*
844          * Clear the last character code.
845          */
846         last = 0;
847 
848         /*
849          * This slightly awkward code handles the different cases needed to
850          * construct a range.
851          */
852         if (c >= 0xd800 && c <= 0xdbff) {
853             /*
854              * If the high surrogate is followed by a range indicator, simply
855              * add it as the range start.  Otherwise, save it in case the next
856              * character is a low surrogate.
857              */
858             if (*sp == '-') {
859                 sp++;
860                 range.min_code = c;
861                 range_end = 1;
862             } else
863               last = c;
864         } else if (range_end == 1) {
865             range.max_code = c;
866             _ure_add_range(cclp, &range, b);
867             range_end = 0;
868         } else {
869             range.min_code = range.max_code = c;
870             if (*sp == '-') {
871                 sp++;
872                 range_end = 1;
873             } else
874               _ure_add_range(cclp, &range, b);
875         }
876     }
877 
878     if (sp < ep && *sp == ']')
879       sp++;
880     else
881       /*
882        * The parse was not terminated by the character class close symbol
883        * (']'), so set an error code.
884        */
885       b->error = _URE_CCLASS_OPEN;
886 
887     return sp - cp;
888 }
889 
890 /*
891  * Probe for a low surrogate hex code.
892  */
893 static unsigned long
_ure_probe_ls(ucs2_t * ls,unsigned long limit,ucs4_t * c)894 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
895 {
896     ucs4_t i, code;
897     ucs2_t *sp, *ep;
898 
899     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
900         if (*sp >= '0' && *sp <= '9')
901           code = (code << 4) + (*sp - '0');
902         else if (*sp >= 'A' && *sp <= 'F')
903           code = (code << 4) + ((*sp - 'A') + 10);
904         else if (*sp >= 'a' && *sp <= 'f')
905           code = (code << 4) + ((*sp - 'a') + 10);
906         else
907           break;
908     }
909 
910     *c = code;
911     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
912 }
913 
914 static unsigned long
_ure_compile_symbol(ucs2_t * sym,unsigned long limit,_ure_symtab_t * symp,_ure_buffer_t * b)915 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
916                     _ure_buffer_t *b)
917 {
918     ucs4_t c;
919     ucs2_t *sp, *ep;
920 
921     sp = sym;
922     ep = sym + limit;
923 
924     if ((c = *sp++) == '\\') {
925 
926         if (sp == ep) {
927             /*
928              * The EOS was encountered when expecting the reverse solidus to
929              * be followed by the character it is escaping.  Set an error code
930              * and return the number of characters consumed up to this point.
931              */
932             b->error = _URE_UNEXPECTED_EOS;
933             return sp - sym;
934         }
935 
936         c = *sp++;
937         switch (c) {
938           case 'p':
939           case 'P':
940             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
941             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
942             break;
943           case 'a':
944             symp->type = _URE_CHAR;
945             symp->sym.chr = 0x07;
946             break;
947           case 'b':
948             symp->type = _URE_CHAR;
949             symp->sym.chr = 0x08;
950             break;
951           case 'f':
952             symp->type = _URE_CHAR;
953             symp->sym.chr = 0x0c;
954             break;
955           case 'n':
956             symp->type = _URE_CHAR;
957             symp->sym.chr = 0x0a;
958             break;
959           case 'r':
960             symp->type = _URE_CHAR;
961             symp->sym.chr = 0x0d;
962             break;
963           case 't':
964             symp->type = _URE_CHAR;
965             symp->sym.chr = 0x09;
966             break;
967           case 'v':
968             symp->type = _URE_CHAR;
969             symp->sym.chr = 0x0b;
970             break;
971           case 'x':
972           case 'X':
973           case 'u':
974           case 'U':
975             /*
976              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
977              * through to the next case.
978              */
979             if (sp < ep &&
980                 ((*sp >= '0' && *sp <= '9') ||
981                  (*sp >= 'A' && *sp <= 'F') ||
982                  (*sp >= 'a' && *sp <= 'f')))
983               sp += _ure_hex(sp, ep - sp, &c);
984             /* FALLTHROUGH */
985           default:
986             /*
987              * Simply add an escaped character here.
988              */
989             symp->type = _URE_CHAR;
990             symp->sym.chr = c;
991         }
992     } else if (c == '^' || c == '$')
993       /*
994        * Handle the BOL and EOL anchors.  This actually consists simply of
995        * setting a flag that indicates that the user supplied anchor match
996        * function should be called.  This needs to be done instead of simply
997        * matching line/paragraph separators because beginning-of-text and
998        * end-of-text tests are needed as well.
999        */
1000       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
1001     else if (c == '[')
1002       /*
1003        * Construct a character class.
1004        */
1005       sp += _ure_cclass(sp, ep - sp, symp, b);
1006     else if (c == '.')
1007       symp->type = _URE_ANY_CHAR;
1008     else {
1009         symp->type = _URE_CHAR;
1010         symp->sym.chr = c;
1011     }
1012 
1013     /*
1014      * If the symbol type happens to be a character and is a high surrogate,
1015      * then probe forward to see if it is followed by a low surrogate that
1016      * needs to be added.
1017      */
1018     if (sp < ep && symp->type == _URE_CHAR &&
1019         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1020 
1021         if (0xdc00 <= *sp && *sp <= 0xdfff) {
1022             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1023                                        (*sp & 0x03ff));
1024             sp++;
1025         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1026                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1027             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1028             if (0xdc00 <= c && c <= 0xdfff) {
1029                 /*
1030                  * Take into account the \[xu] in front of the hex code.
1031                  */
1032                 sp += 2;
1033                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1034                                            (c & 0x03ff));
1035             }
1036         }
1037     }
1038 
1039     /*
1040      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1041      * the `casefold' flag is set.
1042      */
1043     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1044       symp->sym.chr = _ure_tolower(symp->sym.chr);
1045 
1046     /*
1047      * If the symbol constructed is anything other than one of the anchors,
1048      * make sure the _URE_DFA_BLANKLINE flag is removed.
1049      */
1050     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1051       b->flags &= ~_URE_DFA_BLANKLINE;
1052 
1053     /*
1054      * Return the number of characters consumed.
1055      */
1056     return sp - sym;
1057 }
1058 
1059 static int
_ure_sym_neq(_ure_symtab_t * a,_ure_symtab_t * b)1060 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1061 {
1062     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1063       return 1;
1064 
1065     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1066         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1067           return 1;
1068         if (a->sym.ccl.ranges_used > 0 &&
1069             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1070                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1071           return 1;
1072     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1073       return 1;
1074     return 0;
1075 }
1076 
1077 /*
1078  * Construct a symbol, but only keep unique symbols.
1079  */
1080 static ucs2_t
_ure_make_symbol(ucs2_t * sym,unsigned long limit,unsigned long * consumed,_ure_buffer_t * b)1081 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1082                  _ure_buffer_t *b)
1083 {
1084     ucs2_t i;
1085     _ure_symtab_t *sp, symbol;
1086 
1087     /*
1088      * Build the next symbol so we can test to see if it is already in the
1089      * symbol table.
1090      */
1091     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1092     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1093 
1094     /*
1095      * Check to see if the symbol exists.
1096      */
1097     for (i = 0, sp = b->symtab;
1098          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1099 
1100     if (i < b->symtab_used) {
1101         /*
1102          * Free up any ranges used for the symbol.
1103          */
1104         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1105             symbol.sym.ccl.ranges_size > 0)
1106           free((char *) symbol.sym.ccl.ranges);
1107 
1108         return b->symtab[i].id;
1109     }
1110 
1111     /*
1112      * Need to add the new symbol.
1113      */
1114     if (b->symtab_used == b->symtab_size) {
1115         if (b->symtab_size == 0)
1116           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1117         else
1118           b->symtab = (_ure_symtab_t *)
1119               realloc((char *) b->symtab,
1120                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1121         sp = b->symtab + b->symtab_size;
1122         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1123         b->symtab_size += 8;
1124     }
1125 
1126     symbol.id = b->symtab_used++;
1127     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1128                   sizeof(_ure_symtab_t));
1129 
1130     return symbol.id;
1131 }
1132 
1133 /*************************************************************************
1134  *
1135  * End symbol parse functions.
1136  *
1137  *************************************************************************/
1138 
1139 static ucs2_t
_ure_make_expr(ucs2_t type,ucs2_t lhs,ucs2_t rhs,_ure_buffer_t * b)1140 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1141 {
1142     ucs2_t i;
1143 
1144     if (b == 0)
1145       return _URE_NOOP;
1146 
1147     /*
1148      * Determine if the expression already exists or not.
1149      */
1150     for (i = 0; i < b->expr_used; i++) {
1151         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1152             b->expr[i].rhs == rhs)
1153           break;
1154     }
1155     if (i < b->expr_used)
1156       return i;
1157 
1158     /*
1159      * Need to add a new expression.
1160      */
1161     if (b->expr_used == b->expr_size) {
1162         if (b->expr_size == 0)
1163           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1164         else
1165           b->expr = (_ure_elt_t *)
1166               realloc((char *) b->expr,
1167                       sizeof(_ure_elt_t) * (b->expr_size + 8));
1168         b->expr_size += 8;
1169     }
1170 
1171     b->expr[b->expr_used].onstack = 0;
1172     b->expr[b->expr_used].type = type;
1173     b->expr[b->expr_used].lhs = lhs;
1174     b->expr[b->expr_used].rhs = rhs;
1175 
1176     return b->expr_used++;
1177 }
1178 
1179 static unsigned char spmap[] = {
1180     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1181     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1182     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1183 };
1184 
1185 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1186                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1187 
1188 /*
1189  * Convert the regular expression into an NFA in a form that will be easy to
1190  * reduce to a DFA.  The starting state for the reduction will be returned.
1191  */
1192 static ucs2_t
_ure_re2nfa(ucs2_t * re,unsigned long relen,_ure_buffer_t * b)1193 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1194 {
1195     ucs2_t c, state, top, sym, *sp, *ep;
1196     unsigned long used;
1197 
1198     state = _URE_NOOP;
1199 
1200     sp = re;
1201     ep = sp + relen;
1202     while (b->error == _URE_OK && sp < ep) {
1203         c = *sp++;
1204         switch (c) {
1205           case '(':
1206             _ure_push(_URE_PAREN, b);
1207             break;
1208           case ')':
1209             /*
1210              * Check for the case of too many close parentheses.
1211              */
1212             if (_ure_peek(b) == _URE_NOOP) {
1213                 b->error = _URE_UNBALANCED_GROUP;
1214                 break;
1215             }
1216 
1217             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1218               /*
1219                * Make an expression with the AND or OR operator and its right
1220                * hand side.
1221                */
1222               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1223 
1224             /*
1225              * Remove the _URE_PAREN off the stack.
1226              */
1227             (void) _ure_pop(b);
1228             break;
1229           case '*':
1230             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1231             break;
1232           case '+':
1233             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1234             break;
1235           case '?':
1236             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1237             break;
1238           case '|':
1239             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1240               /*
1241                * Make an expression with the AND or OR operator and its right
1242                * hand side.
1243                */
1244               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1245 
1246             _ure_push(state, b);
1247             _ure_push(_URE_OR, b);
1248             break;
1249           default:
1250             sp--;
1251             sym = _ure_make_symbol(sp, ep - sp, &used, b);
1252             sp += used;
1253             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1254             break;
1255         }
1256 
1257         if (c != '(' && c != '|' && sp < ep &&
1258             (!_ure_isspecial(*sp) || *sp == '(')) {
1259             _ure_push(state, b);
1260             _ure_push(_URE_AND, b);
1261         }
1262     }
1263     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1264       /*
1265        * Make an expression with the AND or OR operator and its right
1266        * hand side.
1267        */
1268       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1269 
1270     if (b->stack.slist_used > 0)
1271       b->error = _URE_UNBALANCED_GROUP;
1272 
1273     return (b->error == _URE_OK) ? state : _URE_NOOP;
1274 }
1275 
1276 static void
_ure_add_symstate(ucs2_t sym,ucs2_t state,_ure_buffer_t * b)1277 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1278 {
1279     ucs2_t i, *stp;
1280     _ure_symtab_t *sp;
1281 
1282     /*
1283      * Locate the symbol in the symbol table so the state can be added.
1284      * If the symbol doesn't exist, then a real problem exists.
1285      */
1286     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1287          i++, sp++) ;
1288 
1289     /*
1290      * Now find out if the state exists in the symbol's state list.
1291      */
1292     for (i = 0, stp = sp->states.slist;
1293          i < sp->states.slist_used && state > *stp; i++, stp++) ;
1294 
1295     if (i == sp->states.slist_used || state < *stp) {
1296         /*
1297          * Need to add the state in order.
1298          */
1299         if (sp->states.slist_used == sp->states.slist_size) {
1300             if (sp->states.slist_size == 0)
1301               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1302             else
1303               sp->states.slist = (ucs2_t *)
1304                   realloc((char *) sp->states.slist,
1305                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
1306             sp->states.slist_size += 8;
1307         }
1308         if (i < sp->states.slist_used)
1309           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1310                               (char *) (sp->states.slist + i),
1311                               sizeof(ucs2_t) * (sp->states.slist_used - i));
1312         sp->states.slist[i] = state;
1313         sp->states.slist_used++;
1314     }
1315 }
1316 
1317 static ucs2_t
_ure_add_state(ucs2_t nstates,ucs2_t * states,_ure_buffer_t * b)1318 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1319 {
1320     ucs2_t i;
1321     _ure_state_t *sp;
1322 
1323     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1324         if (sp->st.slist_used == nstates &&
1325             memcmp((char *) states, (char *) sp->st.slist,
1326                    sizeof(ucs2_t) * nstates) == 0)
1327           break;
1328     }
1329 
1330     if (i == b->states.states_used) {
1331         /*
1332          * Need to add a new DFA state (set of NFA states).
1333          */
1334         if (b->states.states_used == b->states.states_size) {
1335             if (b->states.states_size == 0)
1336               b->states.states = (_ure_state_t *)
1337                   malloc(sizeof(_ure_state_t) << 3);
1338             else
1339               b->states.states = (_ure_state_t *)
1340                   realloc((char *) b->states.states,
1341                           sizeof(_ure_state_t) * (b->states.states_size + 8));
1342             sp = b->states.states + b->states.states_size;
1343             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1344             b->states.states_size += 8;
1345         }
1346 
1347         sp = b->states.states + b->states.states_used++;
1348         sp->id = i;
1349 
1350         if (sp->st.slist_used + nstates > sp->st.slist_size) {
1351             if (sp->st.slist_size == 0)
1352               sp->st.slist = (ucs2_t *)
1353                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1354             else
1355               sp->st.slist = (ucs2_t *)
1356                   realloc((char *) sp->st.slist,
1357                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1358             sp->st.slist_size = sp->st.slist_used + nstates;
1359         }
1360         sp->st.slist_used = nstates;
1361         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1362                       sizeof(ucs2_t) * nstates);
1363     }
1364 
1365     /*
1366      * Return the ID of the DFA state representing a group of NFA states.
1367      */
1368     return i;
1369 }
1370 
1371 static void
_ure_reduce(ucs2_t start,_ure_buffer_t * b)1372 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1373 {
1374     ucs2_t i, j, state, eval, syms, rhs;
1375     ucs2_t s1, s2, ns1, ns2;
1376     _ure_state_t *sp;
1377     _ure_symtab_t *smp;
1378 
1379     b->reducing = 1;
1380 
1381     /*
1382      * Add the starting state for the reduction.
1383      */
1384     _ure_add_state(1, &start, b);
1385 
1386     /*
1387      * Process each set of NFA states that get created.
1388      */
1389     for (i = 0; i < b->states.states_used; i++) {
1390         sp = b->states.states + i;
1391 
1392         /*
1393          * Push the current states on the stack.
1394          */
1395         for (j = 0; j < sp->st.slist_used; j++)
1396           _ure_push(sp->st.slist[j], b);
1397 
1398         /*
1399          * Reduce the NFA states.
1400          */
1401         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1402             state = b->stack.slist[j];
1403             eval = 1;
1404 
1405             /*
1406              * This inner loop is the iterative equivalent of recursively
1407              * reducing subexpressions generated as a result of a reduction.
1408              */
1409             while (eval) {
1410                 switch (b->expr[state].type) {
1411                   case _URE_SYMBOL:
1412                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1413                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
1414                     syms++;
1415                     eval = 0;
1416                     break;
1417                   case _URE_ONE:
1418                     sp->accepting = 1;
1419                     eval = 0;
1420                     break;
1421                   case _URE_QUEST:
1422                     s1 = b->expr[state].lhs;
1423                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1424                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
1425                     break;
1426                   case _URE_PLUS:
1427                     s1 = b->expr[state].lhs;
1428                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1429                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
1430                     break;
1431                   case _URE_STAR:
1432                     s1 = b->expr[state].lhs;
1433                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1434                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1435                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1436                     break;
1437                   case _URE_OR:
1438                     s1 = b->expr[state].lhs;
1439                     s2 = b->expr[state].rhs;
1440                     _ure_push(s1, b);
1441                     _ure_push(s2, b);
1442                     eval = 0;
1443                     break;
1444                   case _URE_AND:
1445                     s1 = b->expr[state].lhs;
1446                     s2 = b->expr[state].rhs;
1447                     switch (b->expr[s1].type) {
1448                       case _URE_SYMBOL:
1449                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
1450                         syms++;
1451                         eval = 0;
1452                         break;
1453                       case _URE_ONE:
1454                         state = s2;
1455                         break;
1456                       case _URE_QUEST:
1457                         ns1 = b->expr[s1].lhs;
1458                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1459                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
1460                         break;
1461                       case _URE_PLUS:
1462                         ns1 = b->expr[s1].lhs;
1463                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1464                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1465                         break;
1466                       case _URE_STAR:
1467                         ns1 = b->expr[s1].lhs;
1468                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1469                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
1470                         break;
1471                       case _URE_OR:
1472                         ns1 = b->expr[s1].lhs;
1473                         ns2 = b->expr[s1].rhs;
1474                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1475                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1476                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1477                         break;
1478                       case _URE_AND:
1479                         ns1 = b->expr[s1].lhs;
1480                         ns2 = b->expr[s1].rhs;
1481                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1482                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1483                         break;
1484                     }
1485                 }
1486             }
1487         }
1488 
1489         /*
1490          * Clear the state stack.
1491          */
1492         while (_ure_pop(b) != _URE_NOOP) ;
1493 
1494         /*
1495          * Reset the state pointer because the reduction may have moved it
1496          * during a reallocation.
1497          */
1498         sp = b->states.states + i;
1499 
1500         /*
1501          * Generate the DFA states for the symbols collected during the
1502          * current reduction.
1503          */
1504         if (sp->trans_used + syms > sp->trans_size) {
1505             if (sp->trans_size == 0)
1506               sp->trans = (_ure_elt_t *)
1507                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1508             else
1509               sp->trans = (_ure_elt_t *)
1510                   realloc((char *) sp->trans,
1511                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
1512             sp->trans_size = sp->trans_used + syms;
1513         }
1514 
1515         /*
1516          * Go through the symbol table and generate the DFA state transitions
1517          * for each symbol that has collected NFA states.
1518          */
1519         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1520             sp = b->states.states + i;
1521 
1522             if (smp->states.slist_used > 0) {
1523                 sp->trans[syms].lhs = smp->id;
1524                 rhs = _ure_add_state(smp->states.slist_used,
1525                                      smp->states.slist, b);
1526                 /*
1527                  * Reset the state pointer in case the reallocation moves it
1528                  * in memory.
1529                  */
1530                 sp = b->states.states + i;
1531                 sp->trans[syms].rhs = rhs;
1532 
1533                 smp->states.slist_used = 0;
1534                 syms++;
1535             }
1536         }
1537 
1538         /*
1539          * Set the number of transitions actually used.
1540          */
1541         sp->trans_used = syms;
1542     }
1543     b->reducing = 0;
1544 }
1545 
1546 static void
_ure_add_equiv(ucs2_t l,ucs2_t r,_ure_buffer_t * b)1547 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1548 {
1549     ucs2_t tmp;
1550 
1551     l = b->states.states[l].id;
1552     r = b->states.states[r].id;
1553 
1554     if (l == r)
1555       return;
1556 
1557     if (l > r) {
1558         tmp = l;
1559         l = r;
1560         r = tmp;
1561     }
1562 
1563     /*
1564      * Check to see if the equivalence pair already exists.
1565      */
1566     for (tmp = 0; tmp < b->equiv_used &&
1567              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1568          tmp++) ;
1569 
1570     if (tmp < b->equiv_used)
1571       return;
1572 
1573     if (b->equiv_used == b->equiv_size) {
1574         if (b->equiv_size == 0)
1575           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1576         else
1577           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1578                                               sizeof(_ure_equiv_t) *
1579                                               (b->equiv_size + 8));
1580         b->equiv_size += 8;
1581     }
1582     b->equiv[b->equiv_used].l = l;
1583     b->equiv[b->equiv_used].r = r;
1584     b->equiv_used++;
1585 }
1586 
1587 /*
1588  * Merge the DFA states that are equivalent.
1589  */
1590 static void
_ure_merge_equiv(_ure_buffer_t * b)1591 _ure_merge_equiv(_ure_buffer_t *b)
1592 {
1593     ucs2_t i, j, k, eq, done;
1594     _ure_state_t *sp1, *sp2, *ls, *rs;
1595 
1596     for (i = 0; i < b->states.states_used; i++) {
1597         sp1 = b->states.states + i;
1598         if (sp1->id != i)
1599           continue;
1600         for (j = 0; j < i; j++) {
1601             sp2 = b->states.states + j;
1602             if (sp2->id != j)
1603               continue;
1604             b->equiv_used = 0;
1605             _ure_add_equiv(i, j, b);
1606             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1607                 ls = b->states.states + b->equiv[eq].l;
1608                 rs = b->states.states + b->equiv[eq].r;
1609                 if (ls->accepting != rs->accepting ||
1610                     ls->trans_used != rs->trans_used) {
1611                     done = 1;
1612                     break;
1613                 }
1614                 for (k = 0; k < ls->trans_used &&
1615                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1616                 if (k < ls->trans_used) {
1617                     done = 1;
1618                     break;
1619                 }
1620 
1621                 for (k = 0; k < ls->trans_used; k++)
1622                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1623             }
1624             if (done == 0)
1625               break;
1626         }
1627         for (eq = 0; j < i && eq < b->equiv_used; eq++)
1628           b->states.states[b->equiv[eq].r].id =
1629               b->states.states[b->equiv[eq].l].id;
1630     }
1631 
1632     /*
1633      * Renumber the states appropriately.
1634      */
1635     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1636          sp1++, i++)
1637       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1638 }
1639 
1640 /*************************************************************************
1641  *
1642  * API.
1643  *
1644  *************************************************************************/
1645 
1646 ure_buffer_t
ure_buffer_create(void)1647 ure_buffer_create(void)
1648 {
1649     ure_buffer_t b;
1650 
1651     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1652 
1653     return b;
1654 }
1655 
1656 void
ure_buffer_free(ure_buffer_t buf)1657 ure_buffer_free(ure_buffer_t buf)
1658 {
1659     unsigned long i;
1660 
1661     if (buf == 0)
1662       return;
1663 
1664     if (buf->stack.slist_size > 0)
1665       free((char *) buf->stack.slist);
1666 
1667     if (buf->expr_size > 0)
1668       free((char *) buf->expr);
1669 
1670     for (i = 0; i < buf->symtab_size; i++) {
1671         if (buf->symtab[i].states.slist_size > 0)
1672           free((char *) buf->symtab[i].states.slist);
1673     }
1674 
1675     if (buf->symtab_size > 0)
1676       free((char *) buf->symtab);
1677 
1678     for (i = 0; i < buf->states.states_size; i++) {
1679         if (buf->states.states[i].trans_size > 0)
1680           free((char *) buf->states.states[i].trans);
1681         if (buf->states.states[i].st.slist_size > 0)
1682           free((char *) buf->states.states[i].st.slist);
1683     }
1684 
1685     if (buf->states.states_size > 0)
1686       free((char *) buf->states.states);
1687 
1688     if (buf->equiv_size > 0)
1689       free((char *) buf->equiv);
1690 
1691     free((char *) buf);
1692 }
1693 
1694 ure_dfa_t
ure_compile(ucs2_t * re,unsigned long relen,int casefold,ure_buffer_t buf)1695 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1696 {
1697     ucs2_t i, j, state;
1698     _ure_state_t *sp;
1699     _ure_dstate_t *dsp;
1700     _ure_trans_t *tp;
1701     ure_dfa_t dfa;
1702 
1703     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1704       return 0;
1705 
1706     /*
1707      * Reset the various fields of the compilation buffer.  Default the flags
1708      * to indicate the presence of the "^$" pattern.  If any other pattern
1709      * occurs, then this flag will be removed.  This is done to catch this
1710      * special pattern and handle it specially when matching.
1711      */
1712     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1713     buf->reducing = 0;
1714     buf->stack.slist_used = 0;
1715     buf->expr_used = 0;
1716 
1717     for (i = 0; i < buf->symtab_used; i++)
1718       buf->symtab[i].states.slist_used = 0;
1719     buf->symtab_used = 0;
1720 
1721     for (i = 0; i < buf->states.states_used; i++) {
1722         buf->states.states[i].st.slist_used = 0;
1723         buf->states.states[i].trans_used = 0;
1724     }
1725     buf->states.states_used = 0;
1726 
1727     /*
1728      * Construct the NFA.  If this stage returns a 0, then an error occurred or
1729      * an empty expression was passed.
1730      */
1731     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1732       return 0;
1733 
1734     /*
1735      * Do the expression reduction to get the initial DFA.
1736      */
1737     _ure_reduce(state, buf);
1738 
1739     /*
1740      * Merge all the equivalent DFA states.
1741      */
1742     _ure_merge_equiv(buf);
1743 
1744     /*
1745      * Construct the minimal DFA.
1746      */
1747     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1748     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1749 
1750     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1751 
1752     /*
1753      * Free up the NFA state groups and transfer the symbols from the buffer
1754      * to the DFA.
1755      */
1756     for (i = 0; i < buf->symtab_size; i++) {
1757         if (buf->symtab[i].states.slist_size > 0)
1758           free((char *) buf->symtab[i].states.slist);
1759     }
1760     dfa->syms = buf->symtab;
1761     dfa->nsyms = buf->symtab_used;
1762 
1763     buf->symtab_used = buf->symtab_size = 0;
1764 
1765     /*
1766      * Collect the total number of states and transitions needed for the DFA.
1767      */
1768     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1769          i++, sp++) {
1770         if (sp->id == state) {
1771             dfa->nstates++;
1772             dfa->ntrans += sp->trans_used;
1773             state++;
1774         }
1775     }
1776 
1777     /*
1778      * Allocate enough space for the states and transitions.
1779      */
1780     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1781                                            dfa->nstates);
1782     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1783 
1784     /*
1785      * Actually transfer the DFA states from the buffer.
1786      */
1787     dsp = dfa->states;
1788     tp = dfa->trans;
1789     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1790          i++, sp++) {
1791         if (sp->id == state) {
1792             dsp->trans = tp;
1793             dsp->ntrans = sp->trans_used;
1794             dsp->accepting = sp->accepting;
1795 
1796             /*
1797              * Add the transitions for the state.
1798              */
1799             for (j = 0; j < dsp->ntrans; j++, tp++) {
1800                 tp->symbol = sp->trans[j].lhs;
1801                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1802             }
1803 
1804             dsp++;
1805             state++;
1806         }
1807     }
1808 
1809     return dfa;
1810 }
1811 
1812 void
ure_dfa_free(ure_dfa_t dfa)1813 ure_dfa_free(ure_dfa_t dfa)
1814 {
1815     ucs2_t i;
1816 
1817     if (dfa == 0)
1818       return;
1819 
1820     for (i = 0; i < dfa->nsyms; i++) {
1821         if ((dfa->syms[i].type == _URE_CCLASS ||
1822              dfa->syms[i].type == _URE_NCCLASS) &&
1823             dfa->syms[i].sym.ccl.ranges_size > 0)
1824           free((char *) dfa->syms[i].sym.ccl.ranges);
1825     }
1826     if (dfa->nsyms > 0)
1827       free((char *) dfa->syms);
1828 
1829     if (dfa->nstates > 0)
1830       free((char *) dfa->states);
1831     if (dfa->ntrans > 0)
1832       free((char *) dfa->trans);
1833     free((char *) dfa);
1834 }
1835 
1836 void
ure_write_dfa(ure_dfa_t dfa,FILE * out)1837 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1838 {
1839     ucs2_t i, j, k, h, l;
1840     _ure_dstate_t *sp;
1841     _ure_symtab_t *sym;
1842     _ure_range_t *rp;
1843 
1844     if (dfa == 0 || out == 0)
1845       return;
1846 
1847     /*
1848      * Write all the different character classes.
1849      */
1850     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1851         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1852             fprintf(out, "C%hd = ", sym->id);
1853             if (sym->sym.ccl.ranges_used > 0) {
1854                 putc('[', out);
1855                 if (sym->type == _URE_NCCLASS)
1856                   putc('^', out);
1857             }
1858             if (sym->props != 0) {
1859                 if (sym->type == _URE_NCCLASS)
1860                   fprintf(out, "\\P");
1861                 else
1862                   fprintf(out, "\\p");
1863                 for (k = h = 0; k < 32; k++) {
1864                     if (sym->props & (1 << k)) {
1865                         if (h != 0)
1866                           putc(',', out);
1867                         fprintf(out, "%d", k + 1);
1868                         h = 1;
1869                     }
1870                 }
1871             }
1872             /*
1873              * Dump the ranges.
1874              */
1875             for (k = 0, rp = sym->sym.ccl.ranges;
1876                  k < sym->sym.ccl.ranges_used; k++, rp++) {
1877                 /*
1878                  * Check for UTF16 characters.
1879                  */
1880                 if (0x10000 <= rp->min_code &&
1881                     rp->min_code <= 0x10ffff) {
1882                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1883                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1884                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
1885                 } else
1886                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1887                 if (rp->max_code != rp->min_code) {
1888                     putc('-', out);
1889                     if (rp->max_code >= 0x10000 &&
1890                         rp->max_code <= 0x10ffff) {
1891                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1892                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1893                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
1894                     } else
1895                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1896                 }
1897             }
1898             if (sym->sym.ccl.ranges_used > 0)
1899               putc(']', out);
1900             putc('\n', out);
1901         }
1902     }
1903 
1904     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1905         fprintf(out, "S%hd = ", i);
1906         if (sp->accepting) {
1907             fprintf(out, "1 ");
1908             if (sp->ntrans)
1909               fprintf(out, "| ");
1910         }
1911         for (j = 0; j < sp->ntrans; j++) {
1912             if (j > 0)
1913               fprintf(out, "| ");
1914 
1915             sym = dfa->syms + sp->trans[j].symbol;
1916             switch (sym->type) {
1917               case _URE_CHAR:
1918                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1919                     /*
1920                      * Take care of UTF16 characters.
1921                      */
1922                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1923                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1924                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1925                 } else
1926                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1927                 break;
1928               case _URE_ANY_CHAR:
1929                 fprintf(out, "<any> ");
1930                 break;
1931               case _URE_BOL_ANCHOR:
1932                 fprintf(out, "<bol-anchor> ");
1933                 break;
1934               case _URE_EOL_ANCHOR:
1935                 fprintf(out, "<eol-anchor> ");
1936                 break;
1937               case _URE_CCLASS:
1938               case _URE_NCCLASS:
1939                 fprintf(out, "[C%hd] ", sym->id);
1940                 break;
1941             }
1942             fprintf(out, "S%hd", sp->trans[j].next_state);
1943             if (j + 1 < sp->ntrans)
1944               putc(' ', out);
1945         }
1946         putc('\n', out);
1947     }
1948 }
1949 
1950 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1951                         (cc) == 0x2029)
1952 
1953 int
ure_exec(ure_dfa_t dfa,int flags,ucs2_t * text,unsigned long textlen,unsigned long * match_start,unsigned long * match_end)1954 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1955          unsigned long *match_start, unsigned long *match_end)
1956 {
1957     int i, j, matched, found, skip;
1958     unsigned long ms, me;
1959     ucs4_t c;
1960     ucs2_t *sp, *ep, *lp;
1961     _ure_dstate_t *stp;
1962     _ure_symtab_t *sym;
1963     _ure_range_t *rp;
1964 
1965     if (dfa == 0 || text == 0)
1966       return 0;
1967 
1968     /*
1969      * Handle the special case of an empty string matching the "^$" pattern.
1970      */
1971     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1972         *match_start = *match_end = 0;
1973         return 1;
1974     }
1975 
1976     sp = text;
1977     ep = sp + textlen;
1978 
1979     ms = me = ~0;
1980 
1981     stp = dfa->states;
1982 
1983     for (found = skip = 0; found == 0 && sp < ep; ) {
1984         lp = sp;
1985         c = *sp++;
1986 
1987         /*
1988          * Check to see if this is a high surrogate that should be
1989          * combined with a following low surrogate.
1990          */
1991         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1992             0xdc00 <= *sp && *sp <= 0xdfff)
1993           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1994 
1995         /*
1996          * Determine if the character is non-spacing and should be skipped.
1997          */
1998         if (_ure_matches_properties(_URE_NONSPACING, c) &&
1999             (flags & URE_IGNORE_NONSPACING)) {
2000             sp++;
2001             continue;
2002         }
2003 
2004         if (dfa->flags & _URE_DFA_CASEFOLD)
2005           c = _ure_tolower(c);
2006 
2007         /*
2008          * See if one of the transitions matches.
2009          */
2010         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2011             sym = dfa->syms + stp->trans[i].symbol;
2012             switch (sym->type) {
2013               case _URE_ANY_CHAR:
2014                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2015                     !_ure_issep(c))
2016                   matched = 1;
2017                 break;
2018               case _URE_CHAR:
2019                 if (c == sym->sym.chr)
2020                   matched = 1;
2021                 break;
2022               case _URE_BOL_ANCHOR:
2023                 if (lp == text) {
2024                     sp = lp;
2025                     matched = 1;
2026                 } else if (_ure_issep(c)) {
2027                     if (c == '\r' && sp < ep && *sp == '\n')
2028                       sp++;
2029                     lp = sp;
2030                     matched = 1;
2031                 }
2032                 break;
2033               case _URE_EOL_ANCHOR:
2034                 if (_ure_issep(c)) {
2035                     /*
2036                      * Put the pointer back before the separator so the match
2037                      * end position will be correct.  This case will also
2038                      * cause the `sp' pointer to be advanced over the current
2039                      * separator once the match end point has been recorded.
2040                      */
2041                     sp = lp;
2042                     matched = 1;
2043                 }
2044                 break;
2045               case _URE_CCLASS:
2046               case _URE_NCCLASS:
2047                 if (sym->props != 0)
2048                   matched = _ure_matches_properties(sym->props, c);
2049                 for (j = 0, rp = sym->sym.ccl.ranges;
2050                      j < sym->sym.ccl.ranges_used; j++, rp++) {
2051                     if (rp->min_code <= c && c <= rp->max_code)
2052                       matched = 1;
2053                 }
2054                 if (sym->type == _URE_NCCLASS)
2055                   matched = !matched;
2056                 break;
2057             }
2058 
2059             if (matched) {
2060                 if (ms == ~0UL)
2061                   ms = lp - text;
2062                 else
2063                   me = sp - text;
2064                 stp = dfa->states + stp->trans[i].next_state;
2065 
2066                 /*
2067                  * If the match was an EOL anchor, adjust the pointer past the
2068                  * separator that caused the match.  The correct match
2069                  * position has been recorded already.
2070                  */
2071                 if (sym->type == _URE_EOL_ANCHOR) {
2072                     /*
2073                      * Skip the character that caused the match.
2074                      */
2075                     sp++;
2076 
2077                     /*
2078                      * Handle the infamous CRLF situation.
2079                      */
2080                     if (sp < ep && c == '\r' && *sp == '\n')
2081                       sp++;
2082                 }
2083             }
2084         }
2085 
2086         if (matched == 0) {
2087             if (stp->accepting == 0) {
2088                 /*
2089                  * If the last state was not accepting, then reset
2090                  * and start over.
2091                  */
2092                 stp = dfa->states;
2093                 ms = me = ~0;
2094             } else
2095               /*
2096                * The last state was accepting, so terminate the matching
2097                * loop to avoid more work.
2098                */
2099               found = 1;
2100         } else if (sp == ep) {
2101             if (!stp->accepting) {
2102                 /*
2103                  * This ugly hack is to make sure the end-of-line anchors
2104                  * match when the source text hits the end.  This is only done
2105                  * if the last subexpression matches.
2106                  */
2107                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2108                     sym = dfa->syms + stp->trans[i].symbol;
2109                     if (sym->type ==_URE_EOL_ANCHOR) {
2110                         stp = dfa->states + stp->trans[i].next_state;
2111                         if (stp->accepting) {
2112                             me = sp - text;
2113                             found = 1;
2114                         } else
2115                           break;
2116                     }
2117                 }
2118             } else {
2119                 /*
2120                  * Make sure any conditions that match all the way to the end
2121                  * of the string match.
2122                  */
2123                 found = 1;
2124                 me = sp - text;
2125             }
2126         }
2127     }
2128 
2129     if (found == 0)
2130       ms = me = ~0;
2131 
2132     *match_start = ms;
2133     *match_end = me;
2134 
2135     return (ms != ~0UL) ? 1 : 0;
2136 }
2137