1*38fd1498Szrj // class template regex -*- C++ -*- 2*38fd1498Szrj 3*38fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc. 4*38fd1498Szrj // 5*38fd1498Szrj // This file is part of the GNU ISO C++ Library. This library is free 6*38fd1498Szrj // software; you can redistribute it and/or modify it under the 7*38fd1498Szrj // terms of the GNU General Public License as published by the 8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option) 9*38fd1498Szrj // any later version. 10*38fd1498Szrj 11*38fd1498Szrj // This library is distributed in the hope that it will be useful, 12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of 13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14*38fd1498Szrj // GNU General Public License for more details. 15*38fd1498Szrj 16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional 17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version 18*38fd1498Szrj // 3.1, as published by the Free Software Foundation. 19*38fd1498Szrj 20*38fd1498Szrj // You should have received a copy of the GNU General Public License and 21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program; 22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23*38fd1498Szrj // <http://www.gnu.org/licenses/>. 24*38fd1498Szrj 25*38fd1498Szrj /** 26*38fd1498Szrj * @file bits/regex_compiler.tcc 27*38fd1498Szrj * This is an internal header file, included by other library headers. 28*38fd1498Szrj * Do not attempt to use it directly. @headername{regex} 29*38fd1498Szrj */ 30*38fd1498Szrj 31*38fd1498Szrj // FIXME make comments doxygen format. 32*38fd1498Szrj 33*38fd1498Szrj /* 34*38fd1498Szrj // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 35*38fd1498Szrj // (http://swtch.com/~rsc/regexp/regexp1.html), 36*38fd1498Szrj // but doesn't strictly follow it. 37*38fd1498Szrj // 38*38fd1498Szrj // When compiling, states are *chained* instead of tree- or graph-constructed. 39*38fd1498Szrj // It's more like structured programs: there's if statement and loop statement. 40*38fd1498Szrj // 41*38fd1498Szrj // For alternative structure (say "a|b"), aka "if statement", two branches 42*38fd1498Szrj // should be constructed. However, these two shall merge to an "end_tag" at 43*38fd1498Szrj // the end of this operator: 44*38fd1498Szrj // 45*38fd1498Szrj // branch1 46*38fd1498Szrj // / \ 47*38fd1498Szrj // => begin_tag end_tag => 48*38fd1498Szrj // \ / 49*38fd1498Szrj // branch2 50*38fd1498Szrj // 51*38fd1498Szrj // This is the difference between this implementation and that in Russ's 52*38fd1498Szrj // article. 53*38fd1498Szrj // 54*38fd1498Szrj // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 55*38fd1498Szrj // All dummy nodes will be eliminated at the end of compilation. 56*38fd1498Szrj */ 57*38fd1498Szrj 58*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default) 59*38fd1498Szrj { 60*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION 61*38fd1498Szrj 62*38fd1498Szrj namespace __detail 63*38fd1498Szrj { 64*38fd1498Szrj template<typename _TraitsT> 65*38fd1498Szrj _Compiler<_TraitsT>:: 66*38fd1498Szrj _Compiler(_IterT __b, _IterT __e, 67*38fd1498Szrj const typename _TraitsT::locale_type& __loc, _FlagT __flags) 68*38fd1498Szrj : _M_flags((__flags 69*38fd1498Szrj & (regex_constants::ECMAScript 70*38fd1498Szrj | regex_constants::basic 71*38fd1498Szrj | regex_constants::extended 72*38fd1498Szrj | regex_constants::grep 73*38fd1498Szrj | regex_constants::egrep 74*38fd1498Szrj | regex_constants::awk)) 75*38fd1498Szrj ? __flags 76*38fd1498Szrj : __flags | regex_constants::ECMAScript), 77*38fd1498Szrj _M_scanner(__b, __e, _M_flags, __loc), 78*38fd1498Szrj _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 79*38fd1498Szrj _M_traits(_M_nfa->_M_traits), 80*38fd1498Szrj _M_ctype(std::use_facet<_CtypeT>(__loc)) 81*38fd1498Szrj { 82*38fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 83*38fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 84*38fd1498Szrj this->_M_disjunction(); 85*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_eof)) 86*38fd1498Szrj __throw_regex_error(regex_constants::error_paren); 87*38fd1498Szrj __r._M_append(_M_pop()); 88*38fd1498Szrj __glibcxx_assert(_M_stack.empty()); 89*38fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_end()); 90*38fd1498Szrj __r._M_append(_M_nfa->_M_insert_accept()); 91*38fd1498Szrj _M_nfa->_M_eliminate_dummy(); 92*38fd1498Szrj } 93*38fd1498Szrj 94*38fd1498Szrj template<typename _TraitsT> 95*38fd1498Szrj void 96*38fd1498Szrj _Compiler<_TraitsT>:: 97*38fd1498Szrj _M_disjunction() 98*38fd1498Szrj { 99*38fd1498Szrj this->_M_alternative(); 100*38fd1498Szrj while (_M_match_token(_ScannerT::_S_token_or)) 101*38fd1498Szrj { 102*38fd1498Szrj _StateSeqT __alt1 = _M_pop(); 103*38fd1498Szrj this->_M_alternative(); 104*38fd1498Szrj _StateSeqT __alt2 = _M_pop(); 105*38fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 106*38fd1498Szrj __alt1._M_append(__end); 107*38fd1498Szrj __alt2._M_append(__end); 108*38fd1498Szrj // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 109*38fd1498Szrj // executes _M_alt before _M_next, as well as executing left 110*38fd1498Szrj // alternative before right one. 111*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 112*38fd1498Szrj _M_nfa->_M_insert_alt( 113*38fd1498Szrj __alt2._M_start, __alt1._M_start, false), 114*38fd1498Szrj __end)); 115*38fd1498Szrj } 116*38fd1498Szrj } 117*38fd1498Szrj 118*38fd1498Szrj template<typename _TraitsT> 119*38fd1498Szrj void 120*38fd1498Szrj _Compiler<_TraitsT>:: 121*38fd1498Szrj _M_alternative() 122*38fd1498Szrj { 123*38fd1498Szrj if (this->_M_term()) 124*38fd1498Szrj { 125*38fd1498Szrj _StateSeqT __re = _M_pop(); 126*38fd1498Szrj this->_M_alternative(); 127*38fd1498Szrj __re._M_append(_M_pop()); 128*38fd1498Szrj _M_stack.push(__re); 129*38fd1498Szrj } 130*38fd1498Szrj else 131*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 132*38fd1498Szrj } 133*38fd1498Szrj 134*38fd1498Szrj template<typename _TraitsT> 135*38fd1498Szrj bool 136*38fd1498Szrj _Compiler<_TraitsT>:: 137*38fd1498Szrj _M_term() 138*38fd1498Szrj { 139*38fd1498Szrj if (this->_M_assertion()) 140*38fd1498Szrj return true; 141*38fd1498Szrj if (this->_M_atom()) 142*38fd1498Szrj { 143*38fd1498Szrj while (this->_M_quantifier()); 144*38fd1498Szrj return true; 145*38fd1498Szrj } 146*38fd1498Szrj return false; 147*38fd1498Szrj } 148*38fd1498Szrj 149*38fd1498Szrj template<typename _TraitsT> 150*38fd1498Szrj bool 151*38fd1498Szrj _Compiler<_TraitsT>:: 152*38fd1498Szrj _M_assertion() 153*38fd1498Szrj { 154*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_line_begin)) 155*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 156*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_line_end)) 157*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 158*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_word_bound)) 159*38fd1498Szrj // _M_value[0] == 'n' means it's negative, say "not word boundary". 160*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 161*38fd1498Szrj _M_insert_word_bound(_M_value[0] == 'n'))); 162*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 163*38fd1498Szrj { 164*38fd1498Szrj auto __neg = _M_value[0] == 'n'; 165*38fd1498Szrj this->_M_disjunction(); 166*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 167*38fd1498Szrj __throw_regex_error(regex_constants::error_paren, 168*38fd1498Szrj "Parenthesis is not closed."); 169*38fd1498Szrj auto __tmp = _M_pop(); 170*38fd1498Szrj __tmp._M_append(_M_nfa->_M_insert_accept()); 171*38fd1498Szrj _M_stack.push( 172*38fd1498Szrj _StateSeqT( 173*38fd1498Szrj *_M_nfa, 174*38fd1498Szrj _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 175*38fd1498Szrj } 176*38fd1498Szrj else 177*38fd1498Szrj return false; 178*38fd1498Szrj return true; 179*38fd1498Szrj } 180*38fd1498Szrj 181*38fd1498Szrj template<typename _TraitsT> 182*38fd1498Szrj bool 183*38fd1498Szrj _Compiler<_TraitsT>:: 184*38fd1498Szrj _M_quantifier() 185*38fd1498Szrj { 186*38fd1498Szrj bool __neg = (_M_flags & regex_constants::ECMAScript); 187*38fd1498Szrj auto __init = [this, &__neg]() 188*38fd1498Szrj { 189*38fd1498Szrj if (_M_stack.empty()) 190*38fd1498Szrj __throw_regex_error(regex_constants::error_badrepeat, 191*38fd1498Szrj "Nothing to repeat before a quantifier."); 192*38fd1498Szrj __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 193*38fd1498Szrj }; 194*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_closure0)) 195*38fd1498Szrj { 196*38fd1498Szrj __init(); 197*38fd1498Szrj auto __e = _M_pop(); 198*38fd1498Szrj _StateSeqT __r(*_M_nfa, 199*38fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 200*38fd1498Szrj __e._M_start, __neg)); 201*38fd1498Szrj __e._M_append(__r); 202*38fd1498Szrj _M_stack.push(__r); 203*38fd1498Szrj } 204*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_closure1)) 205*38fd1498Szrj { 206*38fd1498Szrj __init(); 207*38fd1498Szrj auto __e = _M_pop(); 208*38fd1498Szrj __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 209*38fd1498Szrj __e._M_start, __neg)); 210*38fd1498Szrj _M_stack.push(__e); 211*38fd1498Szrj } 212*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_opt)) 213*38fd1498Szrj { 214*38fd1498Szrj __init(); 215*38fd1498Szrj auto __e = _M_pop(); 216*38fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 217*38fd1498Szrj _StateSeqT __r(*_M_nfa, 218*38fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 219*38fd1498Szrj __e._M_start, __neg)); 220*38fd1498Szrj __e._M_append(__end); 221*38fd1498Szrj __r._M_append(__end); 222*38fd1498Szrj _M_stack.push(__r); 223*38fd1498Szrj } 224*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 225*38fd1498Szrj { 226*38fd1498Szrj if (_M_stack.empty()) 227*38fd1498Szrj __throw_regex_error(regex_constants::error_badrepeat, 228*38fd1498Szrj "Nothing to repeat before a quantifier."); 229*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_dup_count)) 230*38fd1498Szrj __throw_regex_error(regex_constants::error_badbrace, 231*38fd1498Szrj "Unexpected token in brace expression."); 232*38fd1498Szrj _StateSeqT __r(_M_pop()); 233*38fd1498Szrj _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 234*38fd1498Szrj long __min_rep = _M_cur_int_value(10); 235*38fd1498Szrj bool __infi = false; 236*38fd1498Szrj long __n; 237*38fd1498Szrj 238*38fd1498Szrj // {3 239*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_comma)) 240*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 241*38fd1498Szrj __n = _M_cur_int_value(10) - __min_rep; 242*38fd1498Szrj else 243*38fd1498Szrj __infi = true; 244*38fd1498Szrj else 245*38fd1498Szrj __n = 0; 246*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_interval_end)) 247*38fd1498Szrj __throw_regex_error(regex_constants::error_brace, 248*38fd1498Szrj "Unexpected end of brace expression."); 249*38fd1498Szrj 250*38fd1498Szrj __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 251*38fd1498Szrj 252*38fd1498Szrj for (long __i = 0; __i < __min_rep; ++__i) 253*38fd1498Szrj __e._M_append(__r._M_clone()); 254*38fd1498Szrj 255*38fd1498Szrj if (__infi) 256*38fd1498Szrj { 257*38fd1498Szrj auto __tmp = __r._M_clone(); 258*38fd1498Szrj _StateSeqT __s(*_M_nfa, 259*38fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 260*38fd1498Szrj __tmp._M_start, __neg)); 261*38fd1498Szrj __tmp._M_append(__s); 262*38fd1498Szrj __e._M_append(__s); 263*38fd1498Szrj } 264*38fd1498Szrj else 265*38fd1498Szrj { 266*38fd1498Szrj if (__n < 0) 267*38fd1498Szrj __throw_regex_error(regex_constants::error_badbrace, 268*38fd1498Szrj "Invalid range in brace expression."); 269*38fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 270*38fd1498Szrj // _M_alt is the "match more" branch, and _M_next is the 271*38fd1498Szrj // "match less" one. Switch _M_alt and _M_next of all created 272*38fd1498Szrj // nodes. This is a hack but IMO works well. 273*38fd1498Szrj std::stack<_StateIdT> __stack; 274*38fd1498Szrj for (long __i = 0; __i < __n; ++__i) 275*38fd1498Szrj { 276*38fd1498Szrj auto __tmp = __r._M_clone(); 277*38fd1498Szrj auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 278*38fd1498Szrj __end, __neg); 279*38fd1498Szrj __stack.push(__alt); 280*38fd1498Szrj __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 281*38fd1498Szrj } 282*38fd1498Szrj __e._M_append(__end); 283*38fd1498Szrj while (!__stack.empty()) 284*38fd1498Szrj { 285*38fd1498Szrj auto& __tmp = (*_M_nfa)[__stack.top()]; 286*38fd1498Szrj __stack.pop(); 287*38fd1498Szrj std::swap(__tmp._M_next, __tmp._M_alt); 288*38fd1498Szrj } 289*38fd1498Szrj } 290*38fd1498Szrj _M_stack.push(__e); 291*38fd1498Szrj } 292*38fd1498Szrj else 293*38fd1498Szrj return false; 294*38fd1498Szrj return true; 295*38fd1498Szrj } 296*38fd1498Szrj 297*38fd1498Szrj #define __INSERT_REGEX_MATCHER(__func, ...)\ 298*38fd1498Szrj do\ 299*38fd1498Szrj if (!(_M_flags & regex_constants::icase))\ 300*38fd1498Szrj if (!(_M_flags & regex_constants::collate))\ 301*38fd1498Szrj __func<false, false>(__VA_ARGS__);\ 302*38fd1498Szrj else\ 303*38fd1498Szrj __func<false, true>(__VA_ARGS__);\ 304*38fd1498Szrj else\ 305*38fd1498Szrj if (!(_M_flags & regex_constants::collate))\ 306*38fd1498Szrj __func<true, false>(__VA_ARGS__);\ 307*38fd1498Szrj else\ 308*38fd1498Szrj __func<true, true>(__VA_ARGS__);\ 309*38fd1498Szrj while (false) 310*38fd1498Szrj 311*38fd1498Szrj template<typename _TraitsT> 312*38fd1498Szrj bool 313*38fd1498Szrj _Compiler<_TraitsT>:: 314*38fd1498Szrj _M_atom() 315*38fd1498Szrj { 316*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_anychar)) 317*38fd1498Szrj { 318*38fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 319*38fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 320*38fd1498Szrj else 321*38fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 322*38fd1498Szrj } 323*38fd1498Szrj else if (_M_try_char()) 324*38fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 325*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_backref)) 326*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 327*38fd1498Szrj _M_insert_backref(_M_cur_int_value(10)))); 328*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 329*38fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 330*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 331*38fd1498Szrj { 332*38fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 333*38fd1498Szrj this->_M_disjunction(); 334*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 335*38fd1498Szrj __throw_regex_error(regex_constants::error_paren, 336*38fd1498Szrj "Parenthesis is not closed."); 337*38fd1498Szrj __r._M_append(_M_pop()); 338*38fd1498Szrj _M_stack.push(__r); 339*38fd1498Szrj } 340*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 341*38fd1498Szrj { 342*38fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 343*38fd1498Szrj this->_M_disjunction(); 344*38fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 345*38fd1498Szrj __throw_regex_error(regex_constants::error_paren, 346*38fd1498Szrj "Parenthesis is not closed."); 347*38fd1498Szrj __r._M_append(_M_pop()); 348*38fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_end()); 349*38fd1498Szrj _M_stack.push(__r); 350*38fd1498Szrj } 351*38fd1498Szrj else if (!_M_bracket_expression()) 352*38fd1498Szrj return false; 353*38fd1498Szrj return true; 354*38fd1498Szrj } 355*38fd1498Szrj 356*38fd1498Szrj template<typename _TraitsT> 357*38fd1498Szrj bool 358*38fd1498Szrj _Compiler<_TraitsT>:: 359*38fd1498Szrj _M_bracket_expression() 360*38fd1498Szrj { 361*38fd1498Szrj bool __neg = 362*38fd1498Szrj _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 363*38fd1498Szrj if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 364*38fd1498Szrj return false; 365*38fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 366*38fd1498Szrj return true; 367*38fd1498Szrj } 368*38fd1498Szrj #undef __INSERT_REGEX_MATCHER 369*38fd1498Szrj 370*38fd1498Szrj template<typename _TraitsT> 371*38fd1498Szrj template<bool __icase, bool __collate> 372*38fd1498Szrj void 373*38fd1498Szrj _Compiler<_TraitsT>:: 374*38fd1498Szrj _M_insert_any_matcher_ecma() 375*38fd1498Szrj { 376*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 377*38fd1498Szrj _M_nfa->_M_insert_matcher 378*38fd1498Szrj (_AnyMatcher<_TraitsT, true, __icase, __collate> 379*38fd1498Szrj (_M_traits)))); 380*38fd1498Szrj } 381*38fd1498Szrj 382*38fd1498Szrj template<typename _TraitsT> 383*38fd1498Szrj template<bool __icase, bool __collate> 384*38fd1498Szrj void 385*38fd1498Szrj _Compiler<_TraitsT>:: 386*38fd1498Szrj _M_insert_any_matcher_posix() 387*38fd1498Szrj { 388*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 389*38fd1498Szrj _M_nfa->_M_insert_matcher 390*38fd1498Szrj (_AnyMatcher<_TraitsT, false, __icase, __collate> 391*38fd1498Szrj (_M_traits)))); 392*38fd1498Szrj } 393*38fd1498Szrj 394*38fd1498Szrj template<typename _TraitsT> 395*38fd1498Szrj template<bool __icase, bool __collate> 396*38fd1498Szrj void 397*38fd1498Szrj _Compiler<_TraitsT>:: 398*38fd1498Szrj _M_insert_char_matcher() 399*38fd1498Szrj { 400*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 401*38fd1498Szrj _M_nfa->_M_insert_matcher 402*38fd1498Szrj (_CharMatcher<_TraitsT, __icase, __collate> 403*38fd1498Szrj (_M_value[0], _M_traits)))); 404*38fd1498Szrj } 405*38fd1498Szrj 406*38fd1498Szrj template<typename _TraitsT> 407*38fd1498Szrj template<bool __icase, bool __collate> 408*38fd1498Szrj void 409*38fd1498Szrj _Compiler<_TraitsT>:: 410*38fd1498Szrj _M_insert_character_class_matcher() 411*38fd1498Szrj { 412*38fd1498Szrj __glibcxx_assert(_M_value.size() == 1); 413*38fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate> __matcher 414*38fd1498Szrj (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 415*38fd1498Szrj __matcher._M_add_character_class(_M_value, false); 416*38fd1498Szrj __matcher._M_ready(); 417*38fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 418*38fd1498Szrj _M_nfa->_M_insert_matcher(std::move(__matcher)))); 419*38fd1498Szrj } 420*38fd1498Szrj 421*38fd1498Szrj template<typename _TraitsT> 422*38fd1498Szrj template<bool __icase, bool __collate> 423*38fd1498Szrj void 424*38fd1498Szrj _Compiler<_TraitsT>:: 425*38fd1498Szrj _M_insert_bracket_matcher(bool __neg) 426*38fd1498Szrj { 427*38fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 428*38fd1498Szrj pair<bool, _CharT> __last_char; // Optional<_CharT> 429*38fd1498Szrj __last_char.first = false; 430*38fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 431*38fd1498Szrj { 432*38fd1498Szrj if (_M_try_char()) 433*38fd1498Szrj { 434*38fd1498Szrj __last_char.first = true; 435*38fd1498Szrj __last_char.second = _M_value[0]; 436*38fd1498Szrj } 437*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 438*38fd1498Szrj { 439*38fd1498Szrj __last_char.first = true; 440*38fd1498Szrj __last_char.second = '-'; 441*38fd1498Szrj } 442*38fd1498Szrj } 443*38fd1498Szrj while (_M_expression_term(__last_char, __matcher)); 444*38fd1498Szrj if (__last_char.first) 445*38fd1498Szrj __matcher._M_add_char(__last_char.second); 446*38fd1498Szrj __matcher._M_ready(); 447*38fd1498Szrj _M_stack.push(_StateSeqT( 448*38fd1498Szrj *_M_nfa, 449*38fd1498Szrj _M_nfa->_M_insert_matcher(std::move(__matcher)))); 450*38fd1498Szrj } 451*38fd1498Szrj 452*38fd1498Szrj template<typename _TraitsT> 453*38fd1498Szrj template<bool __icase, bool __collate> 454*38fd1498Szrj bool 455*38fd1498Szrj _Compiler<_TraitsT>:: 456*38fd1498Szrj _M_expression_term(pair<bool, _CharT>& __last_char, 457*38fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 458*38fd1498Szrj { 459*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_bracket_end)) 460*38fd1498Szrj return false; 461*38fd1498Szrj 462*38fd1498Szrj const auto __push_char = [&](_CharT __ch) 463*38fd1498Szrj { 464*38fd1498Szrj if (__last_char.first) 465*38fd1498Szrj __matcher._M_add_char(__last_char.second); 466*38fd1498Szrj else 467*38fd1498Szrj __last_char.first = true; 468*38fd1498Szrj __last_char.second = __ch; 469*38fd1498Szrj }; 470*38fd1498Szrj const auto __flush = [&] 471*38fd1498Szrj { 472*38fd1498Szrj if (__last_char.first) 473*38fd1498Szrj { 474*38fd1498Szrj __matcher._M_add_char(__last_char.second); 475*38fd1498Szrj __last_char.first = false; 476*38fd1498Szrj } 477*38fd1498Szrj }; 478*38fd1498Szrj 479*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_collsymbol)) 480*38fd1498Szrj { 481*38fd1498Szrj auto __symbol = __matcher._M_add_collate_element(_M_value); 482*38fd1498Szrj if (__symbol.size() == 1) 483*38fd1498Szrj __push_char(__symbol[0]); 484*38fd1498Szrj else 485*38fd1498Szrj __flush(); 486*38fd1498Szrj } 487*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 488*38fd1498Szrj { 489*38fd1498Szrj __flush(); 490*38fd1498Szrj __matcher._M_add_equivalence_class(_M_value); 491*38fd1498Szrj } 492*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 493*38fd1498Szrj { 494*38fd1498Szrj __flush(); 495*38fd1498Szrj __matcher._M_add_character_class(_M_value, false); 496*38fd1498Szrj } 497*38fd1498Szrj else if (_M_try_char()) 498*38fd1498Szrj __push_char(_M_value[0]); 499*38fd1498Szrj // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 500*38fd1498Szrj // except when the '-' is the first or last character in the bracket 501*38fd1498Szrj // expression ([--0]). ECMAScript treats all '-' after a range as a 502*38fd1498Szrj // normal character. Also see above, where _M_expression_term gets called. 503*38fd1498Szrj // 504*38fd1498Szrj // As a result, POSIX rejects [-----], but ECMAScript doesn't. 505*38fd1498Szrj // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 506*38fd1498Szrj // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 507*38fd1498Szrj // 508*38fd1498Szrj // It turns out that no one reads BNFs ;) 509*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 510*38fd1498Szrj { 511*38fd1498Szrj if (!__last_char.first) 512*38fd1498Szrj { 513*38fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 514*38fd1498Szrj { 515*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_bracket_end)) 516*38fd1498Szrj { 517*38fd1498Szrj __push_char('-'); 518*38fd1498Szrj return false; 519*38fd1498Szrj } 520*38fd1498Szrj __throw_regex_error( 521*38fd1498Szrj regex_constants::error_range, 522*38fd1498Szrj "Unexpected dash in bracket expression. For POSIX syntax, " 523*38fd1498Szrj "a dash is not treated literally only when it is at " 524*38fd1498Szrj "beginning or end."); 525*38fd1498Szrj } 526*38fd1498Szrj __push_char('-'); 527*38fd1498Szrj } 528*38fd1498Szrj else 529*38fd1498Szrj { 530*38fd1498Szrj if (_M_try_char()) 531*38fd1498Szrj { 532*38fd1498Szrj __matcher._M_make_range(__last_char.second, _M_value[0]); 533*38fd1498Szrj __last_char.first = false; 534*38fd1498Szrj } 535*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 536*38fd1498Szrj { 537*38fd1498Szrj __matcher._M_make_range(__last_char.second, '-'); 538*38fd1498Szrj __last_char.first = false; 539*38fd1498Szrj } 540*38fd1498Szrj else 541*38fd1498Szrj { 542*38fd1498Szrj if (_M_scanner._M_get_token() 543*38fd1498Szrj != _ScannerT::_S_token_bracket_end) 544*38fd1498Szrj __throw_regex_error( 545*38fd1498Szrj regex_constants::error_range, 546*38fd1498Szrj "Character is expected after a dash."); 547*38fd1498Szrj __push_char('-'); 548*38fd1498Szrj } 549*38fd1498Szrj } 550*38fd1498Szrj } 551*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 552*38fd1498Szrj { 553*38fd1498Szrj __flush(); 554*38fd1498Szrj __matcher._M_add_character_class(_M_value, 555*38fd1498Szrj _M_ctype.is(_CtypeT::upper, 556*38fd1498Szrj _M_value[0])); 557*38fd1498Szrj } 558*38fd1498Szrj else 559*38fd1498Szrj __throw_regex_error(regex_constants::error_brack, 560*38fd1498Szrj "Unexpected character in bracket expression."); 561*38fd1498Szrj 562*38fd1498Szrj return true; 563*38fd1498Szrj } 564*38fd1498Szrj 565*38fd1498Szrj template<typename _TraitsT> 566*38fd1498Szrj bool 567*38fd1498Szrj _Compiler<_TraitsT>:: 568*38fd1498Szrj _M_try_char() 569*38fd1498Szrj { 570*38fd1498Szrj bool __is_char = false; 571*38fd1498Szrj if (_M_match_token(_ScannerT::_S_token_oct_num)) 572*38fd1498Szrj { 573*38fd1498Szrj __is_char = true; 574*38fd1498Szrj _M_value.assign(1, _M_cur_int_value(8)); 575*38fd1498Szrj } 576*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_hex_num)) 577*38fd1498Szrj { 578*38fd1498Szrj __is_char = true; 579*38fd1498Szrj _M_value.assign(1, _M_cur_int_value(16)); 580*38fd1498Szrj } 581*38fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_ord_char)) 582*38fd1498Szrj __is_char = true; 583*38fd1498Szrj return __is_char; 584*38fd1498Szrj } 585*38fd1498Szrj 586*38fd1498Szrj template<typename _TraitsT> 587*38fd1498Szrj bool 588*38fd1498Szrj _Compiler<_TraitsT>:: 589*38fd1498Szrj _M_match_token(_TokenT token) 590*38fd1498Szrj { 591*38fd1498Szrj if (token == _M_scanner._M_get_token()) 592*38fd1498Szrj { 593*38fd1498Szrj _M_value = _M_scanner._M_get_value(); 594*38fd1498Szrj _M_scanner._M_advance(); 595*38fd1498Szrj return true; 596*38fd1498Szrj } 597*38fd1498Szrj return false; 598*38fd1498Szrj } 599*38fd1498Szrj 600*38fd1498Szrj template<typename _TraitsT> 601*38fd1498Szrj int 602*38fd1498Szrj _Compiler<_TraitsT>:: 603*38fd1498Szrj _M_cur_int_value(int __radix) 604*38fd1498Szrj { 605*38fd1498Szrj long __v = 0; 606*38fd1498Szrj for (typename _StringT::size_type __i = 0; 607*38fd1498Szrj __i < _M_value.length(); ++__i) 608*38fd1498Szrj __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 609*38fd1498Szrj return __v; 610*38fd1498Szrj } 611*38fd1498Szrj 612*38fd1498Szrj template<typename _TraitsT, bool __icase, bool __collate> 613*38fd1498Szrj bool 614*38fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate>:: 615*38fd1498Szrj _M_apply(_CharT __ch, false_type) const 616*38fd1498Szrj { 617*38fd1498Szrj return [this, __ch] 618*38fd1498Szrj { 619*38fd1498Szrj if (std::binary_search(_M_char_set.begin(), _M_char_set.end(), 620*38fd1498Szrj _M_translator._M_translate(__ch))) 621*38fd1498Szrj return true; 622*38fd1498Szrj auto __s = _M_translator._M_transform(__ch); 623*38fd1498Szrj for (auto& __it : _M_range_set) 624*38fd1498Szrj if (_M_translator._M_match_range(__it.first, __it.second, __s)) 625*38fd1498Szrj return true; 626*38fd1498Szrj if (_M_traits.isctype(__ch, _M_class_set)) 627*38fd1498Szrj return true; 628*38fd1498Szrj if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 629*38fd1498Szrj _M_traits.transform_primary(&__ch, &__ch+1)) 630*38fd1498Szrj != _M_equiv_set.end()) 631*38fd1498Szrj return true; 632*38fd1498Szrj for (auto& __it : _M_neg_class_set) 633*38fd1498Szrj if (!_M_traits.isctype(__ch, __it)) 634*38fd1498Szrj return true; 635*38fd1498Szrj return false; 636*38fd1498Szrj }() ^ _M_is_non_matching; 637*38fd1498Szrj } 638*38fd1498Szrj } // namespace __detail 639*38fd1498Szrj 640*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION 641*38fd1498Szrj } // namespace 642