1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 #pragma ident "%Z%%M% %I% %E% SMI"
31
32 /*
33 * IMPORTANT NOTE:
34 *
35 * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
36 * IT IS **NOT** CHARACTER SET INDEPENDENT.
37 *
38 */
39
40 #pragma weak _regcmp = regcmp
41
42 #include "lint.h"
43 #include "mtlib.h"
44 #include <limits.h>
45 #include <stdarg.h>
46 #include <stdlib.h>
47 #include <thread.h>
48 #include <wctype.h>
49 #include <widec.h>
50 #include <string.h>
51 #include "tsd.h"
52
53
54 /* CONSTANTS SHARED WITH regex() */
55
56 #include "regex.h"
57
58 /* PRIVATE CONSTANTS */
59
/* Characters that have a special meaning in a regcmp() expression */
#define	BACKSLASH		'\\'
#define	CIRCUMFLEX		'^'
#define	DOLLAR_SIGN		'$'
#define	DOT			'.'
#define	STAR			'*'
#define	PLUS			'+'
#define	DASH			'-'
#define	COMMA			','
#define	LEFT_PAREN		'('
#define	RIGHT_PAREN		')'
#define	LEFT_SQUARE_BRACKET	'['
#define	RIGHT_SQUARE_BRACKET	']'
#define	LEFT_CURLY_BRACE	'{'
#define	RIGHT_CURLY_BRACE	'}'

/* Mask selecting the low-order byte of a group length */
#define	SINGLE_BYTE_MASK	0xff

/* Number of slots in the stack of saved group start pointers */
#define	STRINGP_STACK_SIZE	50
76
77 /* PRIVATE GLOBAL VARIABLES */
78
79 static char *compilep_stack[STRINGP_STACK_SIZE];
80 static char **compilep_stackp;
81 static mutex_t regcmp_lock = DEFAULTMUTEX;
82
83 /* DECLARATIONS OF PRIVATE FUNCTIONS */
84
85 static int add_char(char *compilep, wchar_t wchar);
86 static int add_single_char_expr(char *compilep, wchar_t wchar);
87
88 #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \
89 \
90 va_end(arg_listp); \
91 lmutex_unlock(mutex_lockp); \
92 if ((compile_startp) != (char *)0) \
93 free((void *)compile_startp); \
94 return ((char *)0)
95
96 static int get_count(int *countp, const char *regexp);
97 static int get_digit(const char *regexp);
98 static int get_wchar(wchar_t *wchar, const char *regexp);
99 static char *pop_compilep(void);
100 static char *push_compilep(char *compilep);
101 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
102
103
104 /* DEFINITIONS OF PUBLIC VARIABLES */
105
106 int __i_size;
107
108 /*
109 * define thread-specific storage for __i_size
110 *
111 */
112 int *
___i_size(void)113 ___i_size(void)
114 {
115 if (thr_main())
116 return (&__i_size);
117 return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
118 }
119
120 #define __i_size (*(___i_size()))
121
122 /* DEFINITION OF regcmp() */
123
124 extern char *
regcmp(const char * regexp,...)125 regcmp(const char *regexp, ...)
126 {
127 va_list arg_listp;
128 size_t arg_strlen;
129 boolean_t can_repeat;
130 int char_size;
131 unsigned int class_length;
132 char *compilep;
133 char *compile_startp = (char *)0;
134 int count_length;
135 wchar_t current_char;
136 int expr_length;
137 int groupn;
138 unsigned int group_length;
139 unsigned int high_bits;
140 boolean_t dash_indicates_range;
141 unsigned int low_bits;
142 int max_count;
143 int min_count;
144 const char *next_argp;
145 wchar_t first_char_in_range;
146 char *regex_typep;
147 int return_arg_number;
148 int substringn;
149
150 if (___i_size() == (int *)0)
151 return ((char *)0);
152
153 /*
154 * When compiling a regular expression, regcmp() generates at most
155 * two extra single-byte characters for each character in the
156 * expression, so allocating three times the number of bytes in all
157 * the strings that comprise the regular expression will ensure that
158 * regcmp() won't overwrite the end of the allocated block when
159 * compiling the expression.
160 */
161
162 va_start(arg_listp, regexp);
163 next_argp = regexp;
164 arg_strlen = 0;
165 while (next_argp != (char *)0) {
166 arg_strlen += strlen(next_argp);
167 next_argp = va_arg(arg_listp, /* const */ char *);
168 }
169 va_end(arg_listp);
170
171 if (arg_strlen == 0)
172 return ((char *)0);
173 compile_startp = (char *)malloc(3 * arg_strlen);
174 if (compile_startp == (char *)0)
175 return ((char *)0);
176
177 lmutex_lock(®cmp_lock);
178 __i_size = 0;
179 compilep = compile_startp;
180 compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
181
182 /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
183 va_start(arg_listp, regexp);
184 next_argp = va_arg(arg_listp, /* const */ char *);
185 char_size = get_wchar(¤t_char, regexp);
186 if (char_size < 0) {
187 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
188 } else if (char_size > 0) {
189 regexp += char_size;
190 } else /* (char_size == 0 ) */ {
191 regexp = next_argp;
192 next_argp = va_arg(arg_listp, /* const */ char *);
193 char_size = get_wchar(¤t_char, regexp);
194 if (char_size <= 0) {
195 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
196 } else {
197 regexp += char_size;
198 }
199 }
200
201 /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
202
203 if (current_char == CIRCUMFLEX) {
204 char_size = get_wchar(¤t_char, regexp);
205 if (char_size < 0) {
206 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
207 } else if (char_size > 0) {
208 regexp += char_size;
209 *compilep = (unsigned char)START_OF_STRING_MARK;
210 compilep++;
211 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
212 regexp = next_argp;
213 next_argp = va_arg(arg_listp, /* const */ char *);
214 char_size = get_wchar(¤t_char, regexp);
215 if (char_size <= 0) {
216 ERROR_EXIT(®cmp_lock, arg_listp,
217 compile_startp);
218 } else {
219 regexp += char_size;
220 }
221 *compilep = (unsigned char)START_OF_STRING_MARK;
222 compilep++;
223 } else {
224 /* ((char_size==0) && (next_argp==(char *)0)) */
225 /*
226 * the regular expression is "^"
227 */
228 *compilep = (unsigned char)START_OF_STRING_MARK;
229 compilep++;
230 *compilep = (unsigned char)END_REGEX;
231 compilep++;
232 *compilep = '\0';
233 compilep++;
234 __i_size = (int)(compilep - compile_startp);
235 va_end(arg_listp);
236 lmutex_unlock(®cmp_lock);
237 return (compile_startp);
238 }
239 }
240
241 /* COMPILE THE REGULAR EXPRESSION */
242
243 groupn = 0;
244 substringn = 0;
245 can_repeat = B_FALSE;
246 for (;;) {
247
248 /*
249 * At the end of each iteration get the next character
250 * from the regular expression and increment regexp to
251 * point to the following character. Exit when all
252 * the characters in all the strings in the argument
253 * list have been read.
254 */
255
256 switch (current_char) {
257
258 /*
259 * No fall-through. Each case ends with either
260 * a break or an error exit. Each case starts
261 * with compilep addressing the next location to
262 * be written in the compiled regular expression,
263 * and with regexp addressing the next character
264 * to be read from the regular expression being
265 * compiled. Each case that doesn't return
266 * increments regexp to address the next character
267 * to be read from the regular expression and
268 * increments compilep to address the next
269 * location to be written in the compiled
270 * regular expression.
271 *
272 * NOTE: The comments for each case give the meaning
273 * of the regular expression compiled by the case
274 * and the character string written to the compiled
275 * regular expression by the case. Each single
276 * character
277 * written to the compiled regular expression is
278 * shown enclosed in angle brackets (<>). Each
279 * compiled regular expression begins with a marker
280 * character which is shown as a named constant
281 * (e.g. <ASCII_CHAR>). Character constants are
282 * shown enclosed in single quotes (e.g. <'$'>).
283 * All other single characters written to the
284 * compiled regular expression are shown as lower
285 * case variable names (e.g. <ascii_char> or
286 * <multibyte_char>). Multicharacter
287 * strings written to the compiled regular expression
288 * are shown as variable names followed by elipses
289 * (e.g. <regex...>).
290 */
291
292 case DOLLAR_SIGN:
293 /* end of string marker or simple dollar sign */
294 /* compiles to <END_OF_STRING_MARK> or */
295 /* <ASCII_CHAR><'$'> */
296
297 char_size = get_wchar(¤t_char, regexp);
298 if ((char_size == 0) && (next_argp == (char *)0)) {
299 can_repeat = B_FALSE;
300 *compilep = (unsigned char)END_OF_STRING_MARK;
301 compilep++;
302 } else {
303 can_repeat = B_TRUE;
304 *compilep = (unsigned char)ASCII_CHAR;
305 regex_typep = compilep;
306 compilep++;
307 *compilep = DOLLAR_SIGN;
308 compilep++;
309 }
310 break; /* end case DOLLAR_SIGN */
311
312 case DOT: /* any character */
313
314 /* compiles to <ANY_CHAR> */
315
316 can_repeat = B_TRUE;
317 *compilep = (unsigned char)ANY_CHAR;
318 regex_typep = compilep;
319 compilep++;
320
321 break; /* end case DOT */
322
323 case BACKSLASH: /* escaped character */
324
325 /*
326 * compiles to <ASCII_CHAR><ascii_char> or
327 * <MULTIBYTE_CHAR><multibyte_char>
328 */
329
330 char_size = get_wchar(¤t_char, regexp);
331 if (char_size <= 0) {
332 ERROR_EXIT(®cmp_lock, arg_listp,
333 compile_startp);
334 } else {
335 regexp += char_size;
336 can_repeat = B_TRUE;
337 expr_length = add_single_char_expr(
338 compilep, current_char);
339 regex_typep = compilep;
340 compilep += expr_length;
341 }
342 break; /* end case '\\' */
343
344 case LEFT_SQUARE_BRACKET:
345 /* start of a character class expression */
346
347 /*
348 * [^...c...] compiles to
349 * <NOT_IN_CLASS><class_length><...c...>
350 * [^...a-z...] compiles to
351 * <NOT_IN_CLASS><class_length><...a<THRU>z...>
352 * [...c...] compiles to
353 * <IN_CLASS><class_length><...c...>
354 * [...a-z...] compiles to
355 * <IN_CLASS><class_length><...a<THRU>z...>
356 *
357 * NOTE: <class_length> includes the
358 * <class_length> byte
359 */
360
361 can_repeat = B_TRUE;
362 regex_typep = compilep;
363
364 /* DETERMINE THE CLASS TYPE */
365
366 /*
367 * NOTE: This algorithm checks the value of the
368 * "multibyte"
369 * macro in <euc.h> (included in <widec.h> )
370 * to find out if regcmp()
371 * is compiling the regular expression in a
372 * multibyte locale.
373 */
374 char_size = get_wchar(¤t_char, regexp);
375 if (char_size <= 0) {
376 ERROR_EXIT(®cmp_lock, arg_listp,
377 compile_startp);
378 } else if (current_char == CIRCUMFLEX) {
379 regexp++;
380 char_size = get_wchar(¤t_char, regexp);
381 if (char_size <= 0) {
382 ERROR_EXIT(®cmp_lock,
383 arg_listp, compile_startp);
384 } else {
385 regexp += char_size;
386 if (!multibyte) {
387 *compilep = (unsigned char)
388 NOT_IN_ASCII_CHAR_CLASS;
389 } else {
390 *compilep = (unsigned char)
391 NOT_IN_MULTIBYTE_CHAR_CLASS;
392 }
393 /* leave space for <class_length> */
394 compilep += 2;
395 }
396 } else {
397 regexp += char_size;
398 if (!multibyte) {
399 *compilep = (unsigned char)
400 IN_ASCII_CHAR_CLASS;
401 } else {
402 *compilep = (unsigned char)
403 IN_MULTIBYTE_CHAR_CLASS;
404 }
405 /* leave space for <class_length> */
406 compilep += 2;
407 }
408
409 /* COMPILE THE CLASS */
410 /*
411 * check for a leading right square bracket,
412 * which is allowed
413 */
414
415 if (current_char == RIGHT_SQUARE_BRACKET) {
416 /*
417 * the leading RIGHT_SQUARE_BRACKET may
418 * be part of a character range
419 * expression like "[]-\]"
420 */
421 dash_indicates_range = B_TRUE;
422 first_char_in_range = current_char;
423 char_size = get_wchar(¤t_char, regexp);
424 if (char_size <= 0) {
425 ERROR_EXIT(®cmp_lock,
426 arg_listp, compile_startp);
427 } else {
428 regexp += char_size;
429 *compilep = RIGHT_SQUARE_BRACKET;
430 compilep++;
431 }
432 } else {
433 /*
434 * decode the character in the following
435 * while loop and decide then if it can
436 * be the first character
437 * in a character range expression
438 */
439 dash_indicates_range = B_FALSE;
440 }
441
442 while (current_char != RIGHT_SQUARE_BRACKET) {
443 if (current_char != DASH) {
444 /*
445 * if a DASH follows current_char,
446 * current_char, the DASH and the
447 * character that follows the DASH
448 * may form a character range
449 * expression
450 */
451 dash_indicates_range = B_TRUE;
452 first_char_in_range = current_char;
453 expr_length = add_char(
454 compilep, current_char);
455 compilep += expr_length;
456
457 } else if /* (current_char == DASH) && */
458 (dash_indicates_range == B_FALSE) {
459 /*
460 * current_char is a DASH, but
461 * either begins the entire
462 * character class or follows a
463 * character that's already
464 * part of a character range
465 * expression, so it simply
466 * represents the DASH character
467 * itself
468 */
469 *compilep = DASH;
470 compilep ++;
471 /*
472 * if another DASH follows this
473 * one, this DASH is part
474 * of a character range expression
475 * like "[--\]"
476 */
477 dash_indicates_range = B_TRUE;
478 first_char_in_range = current_char;
479
480 } else /* ((current_char == DASH && */
481 /* (dash_indicates_range == B_TRUE)) */ {
482 /*
483 * the DASH appears after a single
484 * character that isn't
485 * already part of a character
486 * range expression, so it
487 * and the characters preceding
488 * and following it can form a
489 * character range expression
490 * like "[a-z]"
491 */
492 char_size = get_wchar(
493 ¤t_char, regexp);
494 if (char_size <= 0) {
495 ERROR_EXIT(®cmp_lock,
496 arg_listp, compile_startp);
497
498 } else if (current_char ==
499 RIGHT_SQUARE_BRACKET) {
500 /*
501 * the preceding DASH is
502 * the last character in the
503 * class and represents the
504 * DASH character itself
505 */
506 *compilep = DASH;
507 compilep++;
508
509 } else if (valid_range(
510 first_char_in_range,
511 current_char) == B_FALSE) {
512
513 ERROR_EXIT(®cmp_lock,
514 arg_listp, compile_startp);
515
516 } else {
517 /*
518 * the DASH is part of a
519 * character range
520 * expression; encode the
521 * rest of the expression
522 */
523 regexp += char_size;
524 *compilep = (unsigned char)
525 THRU;
526 compilep++;
527 expr_length = add_char(
528 compilep, current_char);
529 compilep += expr_length;
530 /*
531 * if a DASH follows this
532 * character range
533 * expression,
534 * it represents the DASH
535 * character itself
536 */
537 dash_indicates_range =
538 B_FALSE;
539 }
540 }
541
542 /* GET THE NEXT CHARACTER */
543
544 char_size = get_wchar(¤t_char, regexp);
545 if (char_size <= 0) {
546 ERROR_EXIT(®cmp_lock,
547 arg_listp, compile_startp);
548 } else {
549 regexp += char_size;
550 }
551
552 }
553 /* end while (current_char != RIGHT_SQUARE_BRACKET) */
554
555 /* INSERT THE LENGTH OF THE CLASS INTO THE */
556 /* COMPILED EXPRESSION */
557
558 class_length = (unsigned int)
559 (compilep - regex_typep - 1);
560 if ((class_length < 2) ||
561 (class_length > MAX_SINGLE_BYTE_INT)) {
562 ERROR_EXIT(®cmp_lock, arg_listp,
563 compile_startp);
564 } else {
565 *(regex_typep + 1) = (unsigned char)
566 class_length;
567 }
568 break; /* end case LEFT_SQUARE_BRACKET */
569
570 case LEFT_PAREN:
571
572 /*
573 * start of a parenthesized group of regular
574 * expressions compiles to <'\0'><'\0'>, leaving
575 * space in the compiled regular expression for
576 * <group_type|ADDED_LENGTH_BITS><group_length>
577 */
578
579 if (push_compilep(compilep) == (char *)0) {
580 /*
581 * groups can contain groups, so group
582 * start pointers
583 * must be saved and restored in sequence
584 */
585 ERROR_EXIT(®cmp_lock, arg_listp,
586 compile_startp);
587 } else {
588 can_repeat = B_FALSE;
589 *compilep = '\0'; /* for debugging */
590 compilep++;
591 *compilep = '\0'; /* for debugging */
592 compilep++;
593 }
594 break; /* end case LEFT_PAREN */
595
596 case RIGHT_PAREN:
597 /* end of a marked group of regular expressions */
598
599 /*
600 * (<regex>)$0-9 compiles to
601 * <SAVED_GROUP><substringn><compiled_regex...>\
602 * <END_SAVED_GROUP><substringn><return_arg_number>
603 * (<regex>)* compiles to
604 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
605 * <group_length> <compiled_regex...>
606 * <END_GROUP|ZERO_OR_MORE><groupn>
607 * (<regex>)+ compiles to
608 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
609 * <group_length>\
610 * <compiled_regex...><END_GROUP|ONE_OR_MORE>
611 * <groupn>
612 * (<regex>){...} compiles to
613 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
614 * <compiled_regex...><END_GROUP|COUNT><groupn>\
615 * <minimum_repeat_count><maximum_repeat_count>
616 * otherwise (<regex>) compiles to
617 * <SIMPLE_GROUP><blank><compiled_regex...>
618 * <END_GROUP><groupn>
619 *
620 * NOTE:
621 *
622 * group_length + (256 * ADDED_LENGTH_BITS) ==
623 * length_of(<compiled_regex...><END_GROUP|...>
624 * <groupn>)
625 * which also ==
626 * length_of(<group_type|ADDED_LENGTH_BITS>
627 * <group_length>\ <compiled_regex...>)
628 * groupn no longer seems to be used, but the code
629 * still computes it to preserve backward
630 * compatibility
631 * with earlier versions of regex().
632 */
633
634 /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
635
636 regex_typep = pop_compilep();
637 if (regex_typep == (char *)0) {
638 ERROR_EXIT(®cmp_lock, arg_listp,
639 compile_startp);
640 }
641 char_size = get_wchar(¤t_char, regexp);
642 if (char_size < 0) {
643 ERROR_EXIT(®cmp_lock, arg_listp,
644 compile_startp);
645 } else if (char_size == 0) {
646 *regex_typep = SIMPLE_GROUP;
647 can_repeat = B_TRUE;
648 *compilep = (unsigned char)END_GROUP;
649 regex_typep = compilep;
650 compilep++;
651 *compilep = (unsigned char)groupn;
652 groupn++;
653 compilep++;
654 } else if (current_char == DOLLAR_SIGN) {
655 *regex_typep = SAVED_GROUP;
656 regex_typep++;
657 *regex_typep = (char)substringn;
658 can_repeat = B_FALSE;
659 regexp ++;
660 return_arg_number = get_digit(regexp);
661 if ((return_arg_number < 0) ||
662 (substringn >= NSUBSTRINGS)) {
663 ERROR_EXIT(®cmp_lock, arg_listp,
664 compile_startp);
665 }
666 regexp++;
667 *compilep = (unsigned char)END_SAVED_GROUP;
668 compilep++;
669 *compilep = (unsigned char)substringn;
670 substringn++;
671 compilep++;
672 *compilep = (unsigned char)return_arg_number;
673 compilep++;
674 } else {
675 switch (current_char) {
676 case STAR:
677 *regex_typep = ZERO_OR_MORE_GROUP;
678 break;
679 case PLUS:
680 *regex_typep = ONE_OR_MORE_GROUP;
681 break;
682 case LEFT_CURLY_BRACE:
683 *regex_typep = COUNTED_GROUP;
684 break;
685 default:
686 *regex_typep = SIMPLE_GROUP;
687 }
688 if (*regex_typep != SIMPLE_GROUP) {
689 group_length = (unsigned int)
690 (compilep - regex_typep);
691 if (group_length >= 1024) {
692 ERROR_EXIT(®cmp_lock,
693 arg_listp, compile_startp);
694 }
695 high_bits = group_length >>
696 TIMES_256_SHIFT;
697 low_bits = group_length &
698 SINGLE_BYTE_MASK;
699 *regex_typep =
700 (unsigned char)
701 ((unsigned int)
702 *regex_typep | high_bits);
703 regex_typep++;
704 *regex_typep =
705 (unsigned char)low_bits;
706 }
707 can_repeat = B_TRUE;
708 *compilep = (unsigned char)END_GROUP;
709 regex_typep = compilep;
710 compilep++;
711 *compilep = (unsigned char)groupn;
712 groupn++;
713 compilep++;
714 }
715
716 break; /* end case RIGHT_PAREN */
717
718 case STAR: /* zero or more repetitions of the */
719 /* preceding expression */
720
721 /*
722 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
723 * <compiled_regex...>
724 * (<regex...>)* compiles to
725 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
726 * <group_length><compiled_regex...>\
727 * <END_GROUP|ZERO_OR_MORE><groupn>
728 */
729
730 if (can_repeat == B_FALSE) {
731 ERROR_EXIT(®cmp_lock, arg_listp,
732 compile_startp);
733 } else {
734 can_repeat = B_FALSE;
735 *regex_typep = (unsigned char)
736 ((unsigned int)*regex_typep | ZERO_OR_MORE);
737 }
738 break; /* end case '*' */
739
740 case PLUS:
741 /* one or more repetitions of the preceding */
742 /* expression */
743
744 /*
745 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
746 * <compiled_regex...> (<regex...>)+ compiles to
747 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
748 * <group_length><compiled_regex...>\
749 * <END_GROUP|ONE_OR_MORE><groupn>
750 */
751
752 if (can_repeat == B_FALSE) {
753 ERROR_EXIT(®cmp_lock, arg_listp,
754 compile_startp);
755 } else {
756 can_repeat = B_FALSE;
757 *regex_typep =
758 (unsigned char)((unsigned int)*
759 regex_typep | ONE_OR_MORE);
760 }
761 break; /* end case '+' */
762
763 case LEFT_CURLY_BRACE:
764
765 /*
766 * repeat the preceding regular expression
767 * at least min_count times
768 * and at most max_count times
769 *
770 * <regex...>{min_count} compiles to
771 * <regex type|COUNT><compiled_regex...>
772 * <min_count><min_count>
773 *
774 * <regex...>{min_count,} compiles to
775 * <regex type|COUNT><compiled_regex...>
776 * <min_count><UNLIMITED>
777 *
778 * <regex...>{min_count,max_count} compiles to
779 * <regex type>|COUNT><compiled_regex...>
780 * <min_count><max_count>
781 *
782 * (<regex...>){min_count,max_count} compiles to
783 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
784 * <compiled_regex...><END_GROUP|COUNT><groupn>\
785 * <minimum_match_count><maximum_match_count>
786 */
787
788 if (can_repeat == B_FALSE) {
789 ERROR_EXIT(®cmp_lock, arg_listp,
790 compile_startp);
791 }
792 can_repeat = B_FALSE;
793 *regex_typep = (unsigned char)((unsigned int)*
794 regex_typep | COUNT);
795 count_length = get_count(&min_count, regexp);
796 if (count_length <= 0) {
797 ERROR_EXIT(®cmp_lock, arg_listp,
798 compile_startp);
799 }
800 regexp += count_length;
801
802 if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
803 regexp++;
804 max_count = min_count;
805 } else if (*regexp == COMMA) { /* {min_count,..} */
806 regexp++;
807 /* {min_count,} */
808 if (*regexp == RIGHT_CURLY_BRACE) {
809 regexp++;
810 max_count = UNLIMITED;
811 } else { /* {min_count,max_count} */
812 count_length = get_count(
813 &max_count, regexp);
814 if (count_length <= 0) {
815 ERROR_EXIT(®cmp_lock,
816 arg_listp, compile_startp);
817 }
818 regexp += count_length;
819 if (*regexp != RIGHT_CURLY_BRACE) {
820 ERROR_EXIT(®cmp_lock,
821 arg_listp, compile_startp);
822 }
823 regexp++;
824 }
825 } else { /* invalid expression */
826 ERROR_EXIT(®cmp_lock, arg_listp,
827 compile_startp);
828 }
829
830 if ((min_count > MAX_SINGLE_BYTE_INT) ||
831 ((max_count != UNLIMITED) &&
832 (min_count > max_count))) {
833 ERROR_EXIT(®cmp_lock, arg_listp,
834 compile_startp);
835 } else {
836 *compilep = (unsigned char)min_count;
837 compilep++;
838 *compilep = (unsigned char)max_count;
839 compilep++;
840 }
841 break; /* end case LEFT_CURLY_BRACE */
842
843 default: /* a single non-special character */
844
845 /*
846 * compiles to <ASCII_CHAR><ascii_char> or
847 * <MULTIBYTE_CHAR><multibyte_char>
848 */
849
850 can_repeat = B_TRUE;
851 regex_typep = compilep;
852 expr_length = add_single_char_expr(compilep,
853 current_char);
854 compilep += expr_length;
855
856 } /* end switch (current_char) */
857
858 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
859
860 char_size = get_wchar(¤t_char, regexp);
861 if (char_size < 0) {
862 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
863 } else if (char_size > 0) {
864 regexp += char_size;
865 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
866 regexp = next_argp;
867 next_argp = va_arg(arg_listp, /* const */ char *);
868 char_size = get_wchar(¤t_char, regexp);
869 if (char_size <= 0) {
870 ERROR_EXIT(®cmp_lock, arg_listp,
871 compile_startp);
872 } else {
873 regexp += char_size;
874 }
875 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
876 if (pop_compilep() != (char *)0) {
877 /* unmatched parentheses */
878 ERROR_EXIT(®cmp_lock, arg_listp,
879 compile_startp);
880 }
881 *compilep = (unsigned char)END_REGEX;
882 compilep++;
883 *compilep = '\0';
884 compilep++;
885 __i_size = (int)(compilep - compile_startp);
886 va_end(arg_listp);
887 lmutex_unlock(®cmp_lock);
888 return (compile_startp);
889 }
890 } /* end for (;;) */
891
892 } /* regcmp() */
893
894
895 /* DEFINITIONS OF PRIVATE FUNCTIONS */
896
/*
 * Store the character wchar at compilep and return the number of bytes
 * stored.  A character no greater than 0x7f is stored as a single byte;
 * any other character is converted by wctomb(), whose return value is
 * passed back unchanged.
 */
static int
add_char(char *compilep, wchar_t wchar)
{
	if ((unsigned int)wchar > (unsigned int)0x7f)
		return (wctomb(compilep, wchar));

	*compilep = (unsigned char)wchar;
	return (1);
}
910
911 static int
add_single_char_expr(char * compilep,wchar_t wchar)912 add_single_char_expr(char *compilep, wchar_t wchar)
913 {
914 int expr_length = 0;
915
916 if ((unsigned int)wchar <= (unsigned int)0x7f) {
917 *compilep = (unsigned char)ASCII_CHAR;
918 compilep++;
919 *compilep = (unsigned char)wchar;
920 expr_length += 2;
921 } else {
922 *compilep = (unsigned char)MULTIBYTE_CHAR;
923 compilep++;
924 expr_length++;
925 expr_length += wctomb(compilep, wchar);
926 }
927 return (expr_length);
928 }
929
/*
 * Read the unsigned decimal number at the start of regexp.
 *
 * countp - receives the value of the number; it is set to 0 if regexp
 *          doesn't start with a digit and is left untouched if regexp
 *          is a null pointer
 * regexp - the text to read; may be a null pointer
 *
 * Returns the number of digits read, which is 0 if regexp is a null
 * pointer or doesn't start with a digit.
 *
 * A number too large for an int is reported as INT_MAX rather than being
 * allowed to overflow, which would be undefined behavior; the returned
 * length still covers every digit.
 */
static int
get_count(int *countp, const char *regexp)
{
	int count = 0;
	int count_length = 0;

	if (regexp == NULL)
		return (0);

	while ((*regexp >= '0') && (*regexp <= '9')) {
		int digit = (int)(*regexp - '0');

		/* (10 * count) + digit fits only if count is this small */
		if (count > (INT_MAX - digit) / 10)
			count = INT_MAX;
		else
			count = (10 * count) + digit;
		count_length++;
		regexp++;
	}
	*countp = count;
	return (count_length);
}
951
/*
 * Return the value (0 through 9) of the decimal digit that regexp
 * addresses, or -1 if regexp is a null pointer or doesn't address a
 * decimal digit.
 */
static int
get_digit(const char *regexp)
{
	if ((regexp != NULL) && (*regexp >= '0') && (*regexp <= '9'))
		return ((int)(*regexp - '0'));

	return (-1);
}
968
/*
 * Decode the character at the start of regexp into *wcharp and return the
 * number of bytes it occupies.
 *
 * Returns 0, and stores a null character, if regexp is a null pointer or
 * addresses the end of the string.  A byte no greater than 0x7f is taken
 * as a one-byte character.  Anything else is decoded by mbtowc(), whose
 * return value (negative when the bytes don't form a valid character) is
 * passed back unchanged.
 */
static int
get_wchar(wchar_t *wcharp, const char *regexp)
{
	unsigned char first_byte;

	if ((regexp == NULL) || (*regexp == '\0')) {
		*wcharp = (wchar_t)0;
		return (0);
	}

	first_byte = (unsigned char)*regexp;
	if (first_byte > (unsigned char)0x7f)
		return (mbtowc(wcharp, regexp, MB_LEN_MAX));

	*wcharp = (wchar_t)first_byte;
	return (1);
}
988
989 static char *
pop_compilep(void)990 pop_compilep(void)
991 {
992 char *compilep;
993
994 if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
995 return ((char *)0);
996 } else {
997 compilep = *compilep_stackp;
998 compilep_stackp++;
999 return (compilep);
1000 }
1001 }
1002
1003 static char *
push_compilep(char * compilep)1004 push_compilep(char *compilep)
1005 {
1006 if (compilep_stackp <= &compilep_stack[0]) {
1007 return ((char *)0);
1008 } else {
1009 compilep_stackp--;
1010 *compilep_stackp = compilep;
1011 return (compilep);
1012 }
1013 }
1014
1015 static boolean_t
valid_range(wchar_t lower_char,wchar_t upper_char)1016 valid_range(wchar_t lower_char, wchar_t upper_char)
1017 {
1018 return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1019 !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1020 (lower_char < upper_char)) ||
1021 (((lower_char & WCHAR_CSMASK) ==
1022 (upper_char & WCHAR_CSMASK)) &&
1023 (lower_char < upper_char)));
1024 }
1025