xref: /minix3/external/bsd/libc++/dist/libcxx/src/regex.cpp (revision 4684ddb6aab0b36791c8099bc705d6140b3d05d0)
1*4684ddb6SLionel Sambuc //===-------------------------- regex.cpp ---------------------------------===//
2*4684ddb6SLionel Sambuc //
3*4684ddb6SLionel Sambuc //                     The LLVM Compiler Infrastructure
4*4684ddb6SLionel Sambuc //
5*4684ddb6SLionel Sambuc // This file is dual licensed under the MIT and the University of Illinois Open
6*4684ddb6SLionel Sambuc // Source Licenses. See LICENSE.TXT for details.
7*4684ddb6SLionel Sambuc //
8*4684ddb6SLionel Sambuc //===----------------------------------------------------------------------===//
9*4684ddb6SLionel Sambuc 
10*4684ddb6SLionel Sambuc #include "regex"
11*4684ddb6SLionel Sambuc #include "algorithm"
12*4684ddb6SLionel Sambuc #include "iterator"
13*4684ddb6SLionel Sambuc 
14*4684ddb6SLionel Sambuc _LIBCPP_BEGIN_NAMESPACE_STD
15*4684ddb6SLionel Sambuc 
16*4684ddb6SLionel Sambuc static
17*4684ddb6SLionel Sambuc const char*
18*4684ddb6SLionel Sambuc make_error_type_string(regex_constants::error_type ecode)
19*4684ddb6SLionel Sambuc {
20*4684ddb6SLionel Sambuc     switch (ecode)
21*4684ddb6SLionel Sambuc     {
22*4684ddb6SLionel Sambuc     case regex_constants::error_collate:
23*4684ddb6SLionel Sambuc         return "The expression contained an invalid collating element name.";
24*4684ddb6SLionel Sambuc     case regex_constants::error_ctype:
25*4684ddb6SLionel Sambuc         return "The expression contained an invalid character class name.";
26*4684ddb6SLionel Sambuc     case regex_constants::error_escape:
27*4684ddb6SLionel Sambuc         return "The expression contained an invalid escaped character, or a "
28*4684ddb6SLionel Sambuc                "trailing escape.";
29*4684ddb6SLionel Sambuc     case regex_constants::error_backref:
30*4684ddb6SLionel Sambuc         return "The expression contained an invalid back reference.";
31*4684ddb6SLionel Sambuc     case regex_constants::error_brack:
32*4684ddb6SLionel Sambuc         return "The expression contained mismatched [ and ].";
33*4684ddb6SLionel Sambuc     case regex_constants::error_paren:
34*4684ddb6SLionel Sambuc         return "The expression contained mismatched ( and ).";
35*4684ddb6SLionel Sambuc     case regex_constants::error_brace:
36*4684ddb6SLionel Sambuc         return "The expression contained mismatched { and }.";
37*4684ddb6SLionel Sambuc     case regex_constants::error_badbrace:
38*4684ddb6SLionel Sambuc         return "The expression contained an invalid range in a {} expression.";
39*4684ddb6SLionel Sambuc     case regex_constants::error_range:
40*4684ddb6SLionel Sambuc         return "The expression contained an invalid character range, "
41*4684ddb6SLionel Sambuc                "such as [b-a] in most encodings.";
42*4684ddb6SLionel Sambuc     case regex_constants::error_space:
43*4684ddb6SLionel Sambuc         return "There was insufficient memory to convert the expression into "
44*4684ddb6SLionel Sambuc                "a finite state machine.";
45*4684ddb6SLionel Sambuc     case regex_constants::error_badrepeat:
46*4684ddb6SLionel Sambuc         return "One of *?+{ was not preceded by a valid regular expression.";
47*4684ddb6SLionel Sambuc     case regex_constants::error_complexity:
48*4684ddb6SLionel Sambuc         return "The complexity of an attempted match against a regular "
49*4684ddb6SLionel Sambuc                "expression exceeded a pre-set level.";
50*4684ddb6SLionel Sambuc     case regex_constants::error_stack:
51*4684ddb6SLionel Sambuc         return "There was insufficient memory to determine whether the regular "
52*4684ddb6SLionel Sambuc                "expression could match the specified character sequence.";
53*4684ddb6SLionel Sambuc     case regex_constants::__re_err_grammar:
54*4684ddb6SLionel Sambuc         return "An invalid regex grammar has been requested.";
55*4684ddb6SLionel Sambuc     case regex_constants::__re_err_empty:
56*4684ddb6SLionel Sambuc         return "An empty regex is not allowed in the POSIX grammar.";
57*4684ddb6SLionel Sambuc     default:
58*4684ddb6SLionel Sambuc         break;
59*4684ddb6SLionel Sambuc     }
60*4684ddb6SLionel Sambuc     return "Unknown error type";
61*4684ddb6SLionel Sambuc }
62*4684ddb6SLionel Sambuc 
63*4684ddb6SLionel Sambuc regex_error::regex_error(regex_constants::error_type ecode)
64*4684ddb6SLionel Sambuc     : runtime_error(make_error_type_string(ecode)),
65*4684ddb6SLionel Sambuc       __code_(ecode)
66*4684ddb6SLionel Sambuc {}
67*4684ddb6SLionel Sambuc 
// Out-of-line destructor with an empty body and a non-throwing exception
// specification.
// NOTE(review): presumably defined here rather than inline so the class's
// vtable is emitted in this translation unit -- confirm against <regex>.
regex_error::~regex_error() throw() {}
69*4684ddb6SLionel Sambuc 
70*4684ddb6SLionel Sambuc namespace {
71*4684ddb6SLionel Sambuc 
// -Wpadded is silenced only for this declaration. The pragmas are
// clang-specific, so they are guarded to keep other compilers from
// reporting unknown pragmas.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpadded"
#endif

// One row of the collating-element lookup table: a name and the single
// character that name stands for.
struct collationnames
{
    const char* elem_;  // NUL-terminated collating element name (search key)
    char char_;         // character value associated with elem_
};

#if defined(__clang__)
#pragma clang diagnostic pop
#endif
82*4684ddb6SLionel Sambuc 
// Table mapping collating element names to the character each one denotes.
//
// __get_collation_name() searches this array with lower_bound and a
// strcmp-based comparator, so the rows MUST remain sorted by elem_ in strcmp
// (byte-wise) order; inserting a row out of order silently breaks lookups.
// Several names are aliases for the same value, e.g. "hyphen" and
// "hyphen-minus" both map to 0x2d, and "period" and "full-stop" to 0x2e.
const collationnames collatenames[] =
{
    {"A", 0x41},
    {"B", 0x42},
    {"C", 0x43},
    {"D", 0x44},
    {"E", 0x45},
    {"F", 0x46},
    {"G", 0x47},
    {"H", 0x48},
    {"I", 0x49},
    {"J", 0x4a},
    {"K", 0x4b},
    {"L", 0x4c},
    {"M", 0x4d},
    {"N", 0x4e},
    {"NUL", 0x00},
    {"O", 0x4f},
    {"P", 0x50},
    {"Q", 0x51},
    {"R", 0x52},
    {"S", 0x53},
    {"T", 0x54},
    {"U", 0x55},
    {"V", 0x56},
    {"W", 0x57},
    {"X", 0x58},
    {"Y", 0x59},
    {"Z", 0x5a},
    {"a", 0x61},
    {"alert", 0x07},
    {"ampersand", 0x26},
    {"apostrophe", 0x27},
    {"asterisk", 0x2a},
    {"b", 0x62},
    {"backslash", 0x5c},
    {"backspace", 0x08},
    {"c", 0x63},
    {"carriage-return", 0x0d},
    {"circumflex", 0x5e},
    {"circumflex-accent", 0x5e},
    {"colon", 0x3a},
    {"comma", 0x2c},
    {"commercial-at", 0x40},
    {"d", 0x64},
    {"dollar-sign", 0x24},
    {"e", 0x65},
    {"eight", 0x38},
    {"equals-sign", 0x3d},
    {"exclamation-mark", 0x21},
    {"f", 0x66},
    {"five", 0x35},
    {"form-feed", 0x0c},
    {"four", 0x34},
    {"full-stop", 0x2e},
    {"g", 0x67},
    {"grave-accent", 0x60},
    {"greater-than-sign", 0x3e},
    {"h", 0x68},
    {"hyphen", 0x2d},
    {"hyphen-minus", 0x2d},
    {"i", 0x69},
    {"j", 0x6a},
    {"k", 0x6b},
    {"l", 0x6c},
    {"left-brace", 0x7b},
    {"left-curly-bracket", 0x7b},
    {"left-parenthesis", 0x28},
    {"left-square-bracket", 0x5b},
    {"less-than-sign", 0x3c},
    {"low-line", 0x5f},
    {"m", 0x6d},
    {"n", 0x6e},
    {"newline", 0x0a},
    {"nine", 0x39},
    {"number-sign", 0x23},
    {"o", 0x6f},
    {"one", 0x31},
    {"p", 0x70},
    {"percent-sign", 0x25},
    {"period", 0x2e},
    {"plus-sign", 0x2b},
    {"q", 0x71},
    {"question-mark", 0x3f},
    {"quotation-mark", 0x22},
    {"r", 0x72},
    {"reverse-solidus", 0x5c},
    {"right-brace", 0x7d},
    {"right-curly-bracket", 0x7d},
    {"right-parenthesis", 0x29},
    {"right-square-bracket", 0x5d},
    {"s", 0x73},
    {"semicolon", 0x3b},
    {"seven", 0x37},
    {"six", 0x36},
    {"slash", 0x2f},
    {"solidus", 0x2f},
    {"space", 0x20},
    {"t", 0x74},
    {"tab", 0x09},
    {"three", 0x33},
    {"tilde", 0x7e},
    {"two", 0x32},
    {"u", 0x75},
    {"underscore", 0x5f},
    {"v", 0x76},
    {"vertical-line", 0x7c},
    {"vertical-tab", 0x0b},
    {"w", 0x77},
    {"x", 0x78},
    {"y", 0x79},
    {"z", 0x7a},
    {"zero", 0x30}
};
197*4684ddb6SLionel Sambuc 
// -Wpadded is silenced only for this declaration. The pragmas are
// clang-specific, so they are guarded to keep other compilers from
// reporting unknown pragmas.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpadded"
#endif

// One row of the character-class lookup table: a class name and the ctype
// mask that name selects.
struct classnames
{
    const char* elem_;        // NUL-terminated class name (search key)
    ctype_base::mask mask_;   // classification mask associated with elem_
};

#if defined(__clang__)
#pragma clang diagnostic pop
#endif
208*4684ddb6SLionel Sambuc 
// Table mapping character class names to ctype masks.
//
// __get_classname() searches this array with lower_bound and a strcmp-based
// comparator, so the rows MUST remain sorted by elem_ in strcmp (byte-wise)
// order. "d" and "s" are aliases for "digit" and "space"; "w" carries the
// regex-specific __regex_word bit, which __get_classname() widens with the
// alnum, upper and lower masks.
// NOTE(review): the single-letter names presumably back the \d, \s and \w
// escapes -- confirm against the parser in <regex>.
const classnames ClassNames[] =
{
    {"alnum",  ctype_base::alnum},
    {"alpha",  ctype_base::alpha},
    {"blank",  ctype_base::blank},
    {"cntrl",  ctype_base::cntrl},
    {"d",      ctype_base::digit},
    {"digit",  ctype_base::digit},
    {"graph",  ctype_base::graph},
    {"lower",  ctype_base::lower},
    {"print",  ctype_base::print},
    {"punct",  ctype_base::punct},
    {"s",      ctype_base::space},
    {"space",  ctype_base::space},
    {"upper",  ctype_base::upper},
    {"w",      regex_traits<char>::__regex_word},
    {"xdigit", ctype_base::xdigit}
};
227*4684ddb6SLionel Sambuc 
228*4684ddb6SLionel Sambuc struct use_strcmp
229*4684ddb6SLionel Sambuc {
230*4684ddb6SLionel Sambuc     bool operator()(const collationnames& x, const char* y)
231*4684ddb6SLionel Sambuc         {return strcmp(x.elem_, y) < 0;}
232*4684ddb6SLionel Sambuc     bool operator()(const classnames& x, const char* y)
233*4684ddb6SLionel Sambuc         {return strcmp(x.elem_, y) < 0;}
234*4684ddb6SLionel Sambuc };
235*4684ddb6SLionel Sambuc 
236*4684ddb6SLionel Sambuc }
237*4684ddb6SLionel Sambuc 
238*4684ddb6SLionel Sambuc string
239*4684ddb6SLionel Sambuc __get_collation_name(const char* s)
240*4684ddb6SLionel Sambuc {
241*4684ddb6SLionel Sambuc     const collationnames* i =
242*4684ddb6SLionel Sambuc             _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
243*4684ddb6SLionel Sambuc     string r;
244*4684ddb6SLionel Sambuc     if (i != end(collatenames) && strcmp(s, i->elem_) == 0)
245*4684ddb6SLionel Sambuc         r = char(i->char_);
246*4684ddb6SLionel Sambuc     return r;
247*4684ddb6SLionel Sambuc }
248*4684ddb6SLionel Sambuc 
249*4684ddb6SLionel Sambuc ctype_base::mask
250*4684ddb6SLionel Sambuc __get_classname(const char* s, bool __icase)
251*4684ddb6SLionel Sambuc {
252*4684ddb6SLionel Sambuc     const classnames* i =
253*4684ddb6SLionel Sambuc             _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
254*4684ddb6SLionel Sambuc     ctype_base::mask r = 0;
255*4684ddb6SLionel Sambuc     if (i != end(ClassNames) && strcmp(s, i->elem_) == 0)
256*4684ddb6SLionel Sambuc     {
257*4684ddb6SLionel Sambuc         r = i->mask_;
258*4684ddb6SLionel Sambuc         if (r == regex_traits<char>::__regex_word)
259*4684ddb6SLionel Sambuc             r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower;
260*4684ddb6SLionel Sambuc         else if (__icase)
261*4684ddb6SLionel Sambuc         {
262*4684ddb6SLionel Sambuc             if (r & (ctype_base::lower | ctype_base::upper))
263*4684ddb6SLionel Sambuc                 r |= ctype_base::alpha;
264*4684ddb6SLionel Sambuc         }
265*4684ddb6SLionel Sambuc     }
266*4684ddb6SLionel Sambuc     return r;
267*4684ddb6SLionel Sambuc }
268*4684ddb6SLionel Sambuc 
269*4684ddb6SLionel Sambuc template <>
270*4684ddb6SLionel Sambuc void
271*4684ddb6SLionel Sambuc __match_any_but_newline<char>::__exec(__state& __s) const
272*4684ddb6SLionel Sambuc {
273*4684ddb6SLionel Sambuc     if (__s.__current_ != __s.__last_)
274*4684ddb6SLionel Sambuc     {
275*4684ddb6SLionel Sambuc         switch (*__s.__current_)
276*4684ddb6SLionel Sambuc         {
277*4684ddb6SLionel Sambuc         case '\r':
278*4684ddb6SLionel Sambuc         case '\n':
279*4684ddb6SLionel Sambuc             __s.__do_ = __state::__reject;
280*4684ddb6SLionel Sambuc             __s.__node_ = nullptr;
281*4684ddb6SLionel Sambuc             break;
282*4684ddb6SLionel Sambuc         default:
283*4684ddb6SLionel Sambuc             __s.__do_ = __state::__accept_and_consume;
284*4684ddb6SLionel Sambuc             ++__s.__current_;
285*4684ddb6SLionel Sambuc             __s.__node_ = this->first();
286*4684ddb6SLionel Sambuc             break;
287*4684ddb6SLionel Sambuc         }
288*4684ddb6SLionel Sambuc     }
289*4684ddb6SLionel Sambuc     else
290*4684ddb6SLionel Sambuc     {
291*4684ddb6SLionel Sambuc         __s.__do_ = __state::__reject;
292*4684ddb6SLionel Sambuc         __s.__node_ = nullptr;
293*4684ddb6SLionel Sambuc     }
294*4684ddb6SLionel Sambuc }
295*4684ddb6SLionel Sambuc 
296*4684ddb6SLionel Sambuc template <>
297*4684ddb6SLionel Sambuc void
298*4684ddb6SLionel Sambuc __match_any_but_newline<wchar_t>::__exec(__state& __s) const
299*4684ddb6SLionel Sambuc {
300*4684ddb6SLionel Sambuc     if (__s.__current_ != __s.__last_)
301*4684ddb6SLionel Sambuc     {
302*4684ddb6SLionel Sambuc         switch (*__s.__current_)
303*4684ddb6SLionel Sambuc         {
304*4684ddb6SLionel Sambuc         case '\r':
305*4684ddb6SLionel Sambuc         case '\n':
306*4684ddb6SLionel Sambuc         case 0x2028:
307*4684ddb6SLionel Sambuc         case 0x2029:
308*4684ddb6SLionel Sambuc             __s.__do_ = __state::__reject;
309*4684ddb6SLionel Sambuc             __s.__node_ = nullptr;
310*4684ddb6SLionel Sambuc             break;
311*4684ddb6SLionel Sambuc         default:
312*4684ddb6SLionel Sambuc             __s.__do_ = __state::__accept_and_consume;
313*4684ddb6SLionel Sambuc             ++__s.__current_;
314*4684ddb6SLionel Sambuc             __s.__node_ = this->first();
315*4684ddb6SLionel Sambuc             break;
316*4684ddb6SLionel Sambuc         }
317*4684ddb6SLionel Sambuc     }
318*4684ddb6SLionel Sambuc     else
319*4684ddb6SLionel Sambuc     {
320*4684ddb6SLionel Sambuc         __s.__do_ = __state::__reject;
321*4684ddb6SLionel Sambuc         __s.__node_ = nullptr;
322*4684ddb6SLionel Sambuc     }
323*4684ddb6SLionel Sambuc }
324*4684ddb6SLionel Sambuc 
325*4684ddb6SLionel Sambuc _LIBCPP_END_NAMESPACE_STD
326