138fd1498Szrj // class template regex -*- C++ -*- 238fd1498Szrj 338fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc. 438fd1498Szrj // 538fd1498Szrj // This file is part of the GNU ISO C++ Library. This library is free 638fd1498Szrj // software; you can redistribute it and/or modify it under the 738fd1498Szrj // terms of the GNU General Public License as published by the 838fd1498Szrj // Free Software Foundation; either version 3, or (at your option) 938fd1498Szrj // any later version. 1038fd1498Szrj 1138fd1498Szrj // This library is distributed in the hope that it will be useful, 1238fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of 1338fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1438fd1498Szrj // GNU General Public License for more details. 1538fd1498Szrj 1638fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional 1738fd1498Szrj // permissions described in the GCC Runtime Library Exception, version 1838fd1498Szrj // 3.1, as published by the Free Software Foundation. 1938fd1498Szrj 2038fd1498Szrj // You should have received a copy of the GNU General Public License and 2138fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program; 2238fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 2338fd1498Szrj // <http://www.gnu.org/licenses/>. 2438fd1498Szrj 2538fd1498Szrj /** 2638fd1498Szrj * @file bits/regex_compiler.tcc 2738fd1498Szrj * This is an internal header file, included by other library headers. 2838fd1498Szrj * Do not attempt to use it directly. @headername{regex} 2938fd1498Szrj */ 3038fd1498Szrj 3138fd1498Szrj // FIXME make comments doxygen format. 3238fd1498Szrj 3338fd1498Szrj /* 3438fd1498Szrj // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 3538fd1498Szrj // (http://swtch.com/~rsc/regexp/regexp1.html), 3638fd1498Szrj // but doesn't strictly follow it. 3738fd1498Szrj // 3838fd1498Szrj // When compiling, states are *chained* instead of tree- or graph-constructed. 3938fd1498Szrj // It's more like structured programs: there's if statement and loop statement. 4038fd1498Szrj // 4138fd1498Szrj // For alternative structure (say "a|b"), aka "if statement", two branches 4238fd1498Szrj // should be constructed. However, these two shall merge to an "end_tag" at 4338fd1498Szrj // the end of this operator: 4438fd1498Szrj // 4538fd1498Szrj // branch1 4638fd1498Szrj // / \ 4738fd1498Szrj // => begin_tag end_tag => 4838fd1498Szrj // \ / 4938fd1498Szrj // branch2 5038fd1498Szrj // 5138fd1498Szrj // This is the difference between this implementation and that in Russ's 5238fd1498Szrj // article. 5338fd1498Szrj // 5438fd1498Szrj // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 5538fd1498Szrj // All dummy nodes will be eliminated at the end of compilation. 5638fd1498Szrj */ 5738fd1498Szrj 5838fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default) 5938fd1498Szrj { 6038fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION 6138fd1498Szrj 6238fd1498Szrj namespace __detail 6338fd1498Szrj { 6438fd1498Szrj template<typename _TraitsT> 6538fd1498Szrj _Compiler<_TraitsT>:: _Compiler(_IterT __b,_IterT __e,const typename _TraitsT::locale_type & __loc,_FlagT __flags)6638fd1498Szrj _Compiler(_IterT __b, _IterT __e, 6738fd1498Szrj const typename _TraitsT::locale_type& __loc, _FlagT __flags) 6838fd1498Szrj : _M_flags((__flags 6938fd1498Szrj & (regex_constants::ECMAScript 7038fd1498Szrj | regex_constants::basic 7138fd1498Szrj | regex_constants::extended 7238fd1498Szrj | regex_constants::grep 7338fd1498Szrj | regex_constants::egrep 7438fd1498Szrj | regex_constants::awk)) 7538fd1498Szrj ? __flags 7638fd1498Szrj : __flags | regex_constants::ECMAScript), 7738fd1498Szrj _M_scanner(__b, __e, _M_flags, __loc), 7838fd1498Szrj _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 7938fd1498Szrj _M_traits(_M_nfa->_M_traits), 8038fd1498Szrj _M_ctype(std::use_facet<_CtypeT>(__loc)) 8138fd1498Szrj { 8238fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 8338fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 8438fd1498Szrj this->_M_disjunction(); 8538fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_eof)) 8638fd1498Szrj __throw_regex_error(regex_constants::error_paren); 8738fd1498Szrj __r._M_append(_M_pop()); 8838fd1498Szrj __glibcxx_assert(_M_stack.empty()); 8938fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_end()); 9038fd1498Szrj __r._M_append(_M_nfa->_M_insert_accept()); 9138fd1498Szrj _M_nfa->_M_eliminate_dummy(); 9238fd1498Szrj } 9338fd1498Szrj 9438fd1498Szrj template<typename _TraitsT> 9538fd1498Szrj void 9638fd1498Szrj _Compiler<_TraitsT>:: _M_disjunction()9738fd1498Szrj _M_disjunction() 9838fd1498Szrj { 9938fd1498Szrj this->_M_alternative(); 10038fd1498Szrj while (_M_match_token(_ScannerT::_S_token_or)) 10138fd1498Szrj { 10238fd1498Szrj _StateSeqT __alt1 = _M_pop(); 10338fd1498Szrj this->_M_alternative(); 10438fd1498Szrj _StateSeqT __alt2 = _M_pop(); 10538fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 10638fd1498Szrj __alt1._M_append(__end); 10738fd1498Szrj __alt2._M_append(__end); 10838fd1498Szrj // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 10938fd1498Szrj // executes _M_alt before _M_next, as well as executing left 11038fd1498Szrj // alternative before right one. 11138fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 11238fd1498Szrj _M_nfa->_M_insert_alt( 11338fd1498Szrj __alt2._M_start, __alt1._M_start, false), 11438fd1498Szrj __end)); 11538fd1498Szrj } 11638fd1498Szrj } 11738fd1498Szrj 11838fd1498Szrj template<typename _TraitsT> 11938fd1498Szrj void 12038fd1498Szrj _Compiler<_TraitsT>:: _M_alternative()12138fd1498Szrj _M_alternative() 12238fd1498Szrj { 12338fd1498Szrj if (this->_M_term()) 12438fd1498Szrj { 12538fd1498Szrj _StateSeqT __re = _M_pop(); 12638fd1498Szrj this->_M_alternative(); 12738fd1498Szrj __re._M_append(_M_pop()); 12838fd1498Szrj _M_stack.push(__re); 12938fd1498Szrj } 13038fd1498Szrj else 13138fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 13238fd1498Szrj } 13338fd1498Szrj 13438fd1498Szrj template<typename _TraitsT> 13538fd1498Szrj bool 13638fd1498Szrj _Compiler<_TraitsT>:: _M_term()13738fd1498Szrj _M_term() 13838fd1498Szrj { 13938fd1498Szrj if (this->_M_assertion()) 14038fd1498Szrj return true; 14138fd1498Szrj if (this->_M_atom()) 14238fd1498Szrj { 14338fd1498Szrj while (this->_M_quantifier()); 14438fd1498Szrj return true; 14538fd1498Szrj } 14638fd1498Szrj return false; 14738fd1498Szrj } 14838fd1498Szrj 14938fd1498Szrj template<typename _TraitsT> 15038fd1498Szrj bool 15138fd1498Szrj _Compiler<_TraitsT>:: _M_assertion()15238fd1498Szrj _M_assertion() 15338fd1498Szrj { 15438fd1498Szrj if (_M_match_token(_ScannerT::_S_token_line_begin)) 15538fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 15638fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_line_end)) 15738fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 15838fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_word_bound)) 15938fd1498Szrj // _M_value[0] == 'n' means it's negative, say "not word boundary". 16038fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 16138fd1498Szrj _M_insert_word_bound(_M_value[0] == 'n'))); 16238fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 16338fd1498Szrj { 16438fd1498Szrj auto __neg = _M_value[0] == 'n'; 16538fd1498Szrj this->_M_disjunction(); 16638fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 16738fd1498Szrj __throw_regex_error(regex_constants::error_paren, 16838fd1498Szrj "Parenthesis is not closed."); 16938fd1498Szrj auto __tmp = _M_pop(); 17038fd1498Szrj __tmp._M_append(_M_nfa->_M_insert_accept()); 17138fd1498Szrj _M_stack.push( 17238fd1498Szrj _StateSeqT( 17338fd1498Szrj *_M_nfa, 17438fd1498Szrj _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 17538fd1498Szrj } 17638fd1498Szrj else 17738fd1498Szrj return false; 17838fd1498Szrj return true; 17938fd1498Szrj } 18038fd1498Szrj 18138fd1498Szrj template<typename _TraitsT> 18238fd1498Szrj bool 18338fd1498Szrj _Compiler<_TraitsT>:: _M_quantifier()18438fd1498Szrj _M_quantifier() 18538fd1498Szrj { 18638fd1498Szrj bool __neg = (_M_flags & regex_constants::ECMAScript); 18738fd1498Szrj auto __init = [this, &__neg]() 18838fd1498Szrj { 18938fd1498Szrj if (_M_stack.empty()) 19038fd1498Szrj __throw_regex_error(regex_constants::error_badrepeat, 19138fd1498Szrj "Nothing to repeat before a quantifier."); 19238fd1498Szrj __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 19338fd1498Szrj }; 19438fd1498Szrj if (_M_match_token(_ScannerT::_S_token_closure0)) 19538fd1498Szrj { 19638fd1498Szrj __init(); 19738fd1498Szrj auto __e = _M_pop(); 19838fd1498Szrj _StateSeqT __r(*_M_nfa, 19938fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 20038fd1498Szrj __e._M_start, __neg)); 20138fd1498Szrj __e._M_append(__r); 20238fd1498Szrj _M_stack.push(__r); 20338fd1498Szrj } 20438fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_closure1)) 20538fd1498Szrj { 20638fd1498Szrj __init(); 20738fd1498Szrj auto __e = _M_pop(); 20838fd1498Szrj __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 20938fd1498Szrj __e._M_start, __neg)); 21038fd1498Szrj _M_stack.push(__e); 21138fd1498Szrj } 21238fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_opt)) 21338fd1498Szrj { 21438fd1498Szrj __init(); 21538fd1498Szrj auto __e = _M_pop(); 21638fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 21738fd1498Szrj _StateSeqT __r(*_M_nfa, 21838fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 21938fd1498Szrj __e._M_start, __neg)); 22038fd1498Szrj __e._M_append(__end); 22138fd1498Szrj __r._M_append(__end); 22238fd1498Szrj _M_stack.push(__r); 22338fd1498Szrj } 22438fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 22538fd1498Szrj { 22638fd1498Szrj if (_M_stack.empty()) 22738fd1498Szrj __throw_regex_error(regex_constants::error_badrepeat, 22838fd1498Szrj "Nothing to repeat before a quantifier."); 22938fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_dup_count)) 23038fd1498Szrj __throw_regex_error(regex_constants::error_badbrace, 23138fd1498Szrj "Unexpected token in brace expression."); 23238fd1498Szrj _StateSeqT __r(_M_pop()); 23338fd1498Szrj _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 23438fd1498Szrj long __min_rep = _M_cur_int_value(10); 23538fd1498Szrj bool __infi = false; 23638fd1498Szrj long __n; 23738fd1498Szrj 23838fd1498Szrj // {3 23938fd1498Szrj if (_M_match_token(_ScannerT::_S_token_comma)) 24038fd1498Szrj if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 24138fd1498Szrj __n = _M_cur_int_value(10) - __min_rep; 24238fd1498Szrj else 24338fd1498Szrj __infi = true; 24438fd1498Szrj else 24538fd1498Szrj __n = 0; 24638fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_interval_end)) 24738fd1498Szrj __throw_regex_error(regex_constants::error_brace, 24838fd1498Szrj "Unexpected end of brace expression."); 24938fd1498Szrj 25038fd1498Szrj __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 25138fd1498Szrj 25238fd1498Szrj for (long __i = 0; __i < __min_rep; ++__i) 25338fd1498Szrj __e._M_append(__r._M_clone()); 25438fd1498Szrj 25538fd1498Szrj if (__infi) 25638fd1498Szrj { 25738fd1498Szrj auto __tmp = __r._M_clone(); 25838fd1498Szrj _StateSeqT __s(*_M_nfa, 25938fd1498Szrj _M_nfa->_M_insert_repeat(_S_invalid_state_id, 26038fd1498Szrj __tmp._M_start, __neg)); 26138fd1498Szrj __tmp._M_append(__s); 26238fd1498Szrj __e._M_append(__s); 26338fd1498Szrj } 26438fd1498Szrj else 26538fd1498Szrj { 26638fd1498Szrj if (__n < 0) 26738fd1498Szrj __throw_regex_error(regex_constants::error_badbrace, 26838fd1498Szrj "Invalid range in brace expression."); 26938fd1498Szrj auto __end = _M_nfa->_M_insert_dummy(); 27038fd1498Szrj // _M_alt is the "match more" branch, and _M_next is the 27138fd1498Szrj // "match less" one. Switch _M_alt and _M_next of all created 27238fd1498Szrj // nodes. This is a hack but IMO works well. 27338fd1498Szrj std::stack<_StateIdT> __stack; 27438fd1498Szrj for (long __i = 0; __i < __n; ++__i) 27538fd1498Szrj { 27638fd1498Szrj auto __tmp = __r._M_clone(); 27738fd1498Szrj auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 27838fd1498Szrj __end, __neg); 27938fd1498Szrj __stack.push(__alt); 28038fd1498Szrj __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 28138fd1498Szrj } 28238fd1498Szrj __e._M_append(__end); 28338fd1498Szrj while (!__stack.empty()) 28438fd1498Szrj { 28538fd1498Szrj auto& __tmp = (*_M_nfa)[__stack.top()]; 28638fd1498Szrj __stack.pop(); 28738fd1498Szrj std::swap(__tmp._M_next, __tmp._M_alt); 28838fd1498Szrj } 28938fd1498Szrj } 29038fd1498Szrj _M_stack.push(__e); 29138fd1498Szrj } 29238fd1498Szrj else 29338fd1498Szrj return false; 29438fd1498Szrj return true; 29538fd1498Szrj } 29638fd1498Szrj 29738fd1498Szrj #define __INSERT_REGEX_MATCHER(__func, ...)\ 298*58e805e6Szrj do {\ 29938fd1498Szrj if (!(_M_flags & regex_constants::icase))\ 30038fd1498Szrj if (!(_M_flags & regex_constants::collate))\ 30138fd1498Szrj __func<false, false>(__VA_ARGS__);\ 30238fd1498Szrj else\ 30338fd1498Szrj __func<false, true>(__VA_ARGS__);\ 30438fd1498Szrj else\ 30538fd1498Szrj if (!(_M_flags & regex_constants::collate))\ 30638fd1498Szrj __func<true, false>(__VA_ARGS__);\ 30738fd1498Szrj else\ 30838fd1498Szrj __func<true, true>(__VA_ARGS__);\ 309*58e805e6Szrj } while (false) 31038fd1498Szrj 31138fd1498Szrj template<typename _TraitsT> 31238fd1498Szrj bool 31338fd1498Szrj _Compiler<_TraitsT>:: _M_atom()31438fd1498Szrj _M_atom() 31538fd1498Szrj { 31638fd1498Szrj if (_M_match_token(_ScannerT::_S_token_anychar)) 31738fd1498Szrj { 31838fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 31938fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 32038fd1498Szrj else 32138fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 32238fd1498Szrj } 32338fd1498Szrj else if (_M_try_char()) 32438fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 32538fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_backref)) 32638fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 32738fd1498Szrj _M_insert_backref(_M_cur_int_value(10)))); 32838fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 32938fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 33038fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 33138fd1498Szrj { 33238fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 33338fd1498Szrj this->_M_disjunction(); 33438fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 33538fd1498Szrj __throw_regex_error(regex_constants::error_paren, 33638fd1498Szrj "Parenthesis is not closed."); 33738fd1498Szrj __r._M_append(_M_pop()); 33838fd1498Szrj _M_stack.push(__r); 33938fd1498Szrj } 34038fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 34138fd1498Szrj { 34238fd1498Szrj _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 34338fd1498Szrj this->_M_disjunction(); 34438fd1498Szrj if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 34538fd1498Szrj __throw_regex_error(regex_constants::error_paren, 34638fd1498Szrj "Parenthesis is not closed."); 34738fd1498Szrj __r._M_append(_M_pop()); 34838fd1498Szrj __r._M_append(_M_nfa->_M_insert_subexpr_end()); 34938fd1498Szrj _M_stack.push(__r); 35038fd1498Szrj } 35138fd1498Szrj else if (!_M_bracket_expression()) 35238fd1498Szrj return false; 35338fd1498Szrj return true; 35438fd1498Szrj } 35538fd1498Szrj 35638fd1498Szrj template<typename _TraitsT> 35738fd1498Szrj bool 35838fd1498Szrj _Compiler<_TraitsT>:: _M_bracket_expression()35938fd1498Szrj _M_bracket_expression() 36038fd1498Szrj { 36138fd1498Szrj bool __neg = 36238fd1498Szrj _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 36338fd1498Szrj if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 36438fd1498Szrj return false; 36538fd1498Szrj __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 36638fd1498Szrj return true; 36738fd1498Szrj } 36838fd1498Szrj #undef __INSERT_REGEX_MATCHER 36938fd1498Szrj 37038fd1498Szrj template<typename _TraitsT> 37138fd1498Szrj template<bool __icase, bool __collate> 37238fd1498Szrj void 37338fd1498Szrj _Compiler<_TraitsT>:: _M_insert_any_matcher_ecma()37438fd1498Szrj _M_insert_any_matcher_ecma() 37538fd1498Szrj { 37638fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 37738fd1498Szrj _M_nfa->_M_insert_matcher 37838fd1498Szrj (_AnyMatcher<_TraitsT, true, __icase, __collate> 37938fd1498Szrj (_M_traits)))); 38038fd1498Szrj } 38138fd1498Szrj 38238fd1498Szrj template<typename _TraitsT> 38338fd1498Szrj template<bool __icase, bool __collate> 38438fd1498Szrj void 38538fd1498Szrj _Compiler<_TraitsT>:: _M_insert_any_matcher_posix()38638fd1498Szrj _M_insert_any_matcher_posix() 38738fd1498Szrj { 38838fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 38938fd1498Szrj _M_nfa->_M_insert_matcher 39038fd1498Szrj (_AnyMatcher<_TraitsT, false, __icase, __collate> 39138fd1498Szrj (_M_traits)))); 39238fd1498Szrj } 39338fd1498Szrj 39438fd1498Szrj template<typename _TraitsT> 39538fd1498Szrj template<bool __icase, bool __collate> 39638fd1498Szrj void 39738fd1498Szrj _Compiler<_TraitsT>:: _M_insert_char_matcher()39838fd1498Szrj _M_insert_char_matcher() 39938fd1498Szrj { 40038fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 40138fd1498Szrj _M_nfa->_M_insert_matcher 40238fd1498Szrj (_CharMatcher<_TraitsT, __icase, __collate> 40338fd1498Szrj (_M_value[0], _M_traits)))); 40438fd1498Szrj } 40538fd1498Szrj 40638fd1498Szrj template<typename _TraitsT> 40738fd1498Szrj template<bool __icase, bool __collate> 40838fd1498Szrj void 40938fd1498Szrj _Compiler<_TraitsT>:: _M_insert_character_class_matcher()41038fd1498Szrj _M_insert_character_class_matcher() 41138fd1498Szrj { 41238fd1498Szrj __glibcxx_assert(_M_value.size() == 1); 41338fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate> __matcher 41438fd1498Szrj (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 41538fd1498Szrj __matcher._M_add_character_class(_M_value, false); 41638fd1498Szrj __matcher._M_ready(); 41738fd1498Szrj _M_stack.push(_StateSeqT(*_M_nfa, 41838fd1498Szrj _M_nfa->_M_insert_matcher(std::move(__matcher)))); 41938fd1498Szrj } 42038fd1498Szrj 42138fd1498Szrj template<typename _TraitsT> 42238fd1498Szrj template<bool __icase, bool __collate> 42338fd1498Szrj void 42438fd1498Szrj _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg)42538fd1498Szrj _M_insert_bracket_matcher(bool __neg) 42638fd1498Szrj { 42738fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 42838fd1498Szrj pair<bool, _CharT> __last_char; // Optional<_CharT> 42938fd1498Szrj __last_char.first = false; 43038fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 43138fd1498Szrj { 43238fd1498Szrj if (_M_try_char()) 43338fd1498Szrj { 43438fd1498Szrj __last_char.first = true; 43538fd1498Szrj __last_char.second = _M_value[0]; 43638fd1498Szrj } 43738fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 43838fd1498Szrj { 43938fd1498Szrj __last_char.first = true; 44038fd1498Szrj __last_char.second = '-'; 44138fd1498Szrj } 44238fd1498Szrj } 44338fd1498Szrj while (_M_expression_term(__last_char, __matcher)); 44438fd1498Szrj if (__last_char.first) 44538fd1498Szrj __matcher._M_add_char(__last_char.second); 44638fd1498Szrj __matcher._M_ready(); 44738fd1498Szrj _M_stack.push(_StateSeqT( 44838fd1498Szrj *_M_nfa, 44938fd1498Szrj _M_nfa->_M_insert_matcher(std::move(__matcher)))); 45038fd1498Szrj } 45138fd1498Szrj 45238fd1498Szrj template<typename _TraitsT> 45338fd1498Szrj template<bool __icase, bool __collate> 45438fd1498Szrj bool 45538fd1498Szrj _Compiler<_TraitsT>:: _M_expression_term(pair<bool,_CharT> & __last_char,_BracketMatcher<_TraitsT,__icase,__collate> & __matcher)45638fd1498Szrj _M_expression_term(pair<bool, _CharT>& __last_char, 45738fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 45838fd1498Szrj { 45938fd1498Szrj if (_M_match_token(_ScannerT::_S_token_bracket_end)) 46038fd1498Szrj return false; 46138fd1498Szrj 46238fd1498Szrj const auto __push_char = [&](_CharT __ch) 46338fd1498Szrj { 46438fd1498Szrj if (__last_char.first) 46538fd1498Szrj __matcher._M_add_char(__last_char.second); 46638fd1498Szrj else 46738fd1498Szrj __last_char.first = true; 46838fd1498Szrj __last_char.second = __ch; 46938fd1498Szrj }; 47038fd1498Szrj const auto __flush = [&] 47138fd1498Szrj { 47238fd1498Szrj if (__last_char.first) 47338fd1498Szrj { 47438fd1498Szrj __matcher._M_add_char(__last_char.second); 47538fd1498Szrj __last_char.first = false; 47638fd1498Szrj } 47738fd1498Szrj }; 47838fd1498Szrj 47938fd1498Szrj if (_M_match_token(_ScannerT::_S_token_collsymbol)) 48038fd1498Szrj { 48138fd1498Szrj auto __symbol = __matcher._M_add_collate_element(_M_value); 48238fd1498Szrj if (__symbol.size() == 1) 48338fd1498Szrj __push_char(__symbol[0]); 48438fd1498Szrj else 48538fd1498Szrj __flush(); 48638fd1498Szrj } 48738fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 48838fd1498Szrj { 48938fd1498Szrj __flush(); 49038fd1498Szrj __matcher._M_add_equivalence_class(_M_value); 49138fd1498Szrj } 49238fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 49338fd1498Szrj { 49438fd1498Szrj __flush(); 49538fd1498Szrj __matcher._M_add_character_class(_M_value, false); 49638fd1498Szrj } 49738fd1498Szrj else if (_M_try_char()) 49838fd1498Szrj __push_char(_M_value[0]); 49938fd1498Szrj // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 50038fd1498Szrj // except when the '-' is the first or last character in the bracket 50138fd1498Szrj // expression ([--0]). ECMAScript treats all '-' after a range as a 50238fd1498Szrj // normal character. Also see above, where _M_expression_term gets called. 50338fd1498Szrj // 50438fd1498Szrj // As a result, POSIX rejects [-----], but ECMAScript doesn't. 50538fd1498Szrj // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 50638fd1498Szrj // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 50738fd1498Szrj // 50838fd1498Szrj // It turns out that no one reads BNFs ;) 50938fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 51038fd1498Szrj { 51138fd1498Szrj if (!__last_char.first) 51238fd1498Szrj { 51338fd1498Szrj if (!(_M_flags & regex_constants::ECMAScript)) 51438fd1498Szrj { 51538fd1498Szrj if (_M_match_token(_ScannerT::_S_token_bracket_end)) 51638fd1498Szrj { 51738fd1498Szrj __push_char('-'); 51838fd1498Szrj return false; 51938fd1498Szrj } 52038fd1498Szrj __throw_regex_error( 52138fd1498Szrj regex_constants::error_range, 52238fd1498Szrj "Unexpected dash in bracket expression. For POSIX syntax, " 52338fd1498Szrj "a dash is not treated literally only when it is at " 52438fd1498Szrj "beginning or end."); 52538fd1498Szrj } 52638fd1498Szrj __push_char('-'); 52738fd1498Szrj } 52838fd1498Szrj else 52938fd1498Szrj { 53038fd1498Szrj if (_M_try_char()) 53138fd1498Szrj { 53238fd1498Szrj __matcher._M_make_range(__last_char.second, _M_value[0]); 53338fd1498Szrj __last_char.first = false; 53438fd1498Szrj } 53538fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 53638fd1498Szrj { 53738fd1498Szrj __matcher._M_make_range(__last_char.second, '-'); 53838fd1498Szrj __last_char.first = false; 53938fd1498Szrj } 54038fd1498Szrj else 54138fd1498Szrj { 54238fd1498Szrj if (_M_scanner._M_get_token() 54338fd1498Szrj != _ScannerT::_S_token_bracket_end) 54438fd1498Szrj __throw_regex_error( 54538fd1498Szrj regex_constants::error_range, 54638fd1498Szrj "Character is expected after a dash."); 54738fd1498Szrj __push_char('-'); 54838fd1498Szrj } 54938fd1498Szrj } 55038fd1498Szrj } 55138fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 55238fd1498Szrj { 55338fd1498Szrj __flush(); 55438fd1498Szrj __matcher._M_add_character_class(_M_value, 55538fd1498Szrj _M_ctype.is(_CtypeT::upper, 55638fd1498Szrj _M_value[0])); 55738fd1498Szrj } 55838fd1498Szrj else 55938fd1498Szrj __throw_regex_error(regex_constants::error_brack, 56038fd1498Szrj "Unexpected character in bracket expression."); 56138fd1498Szrj 56238fd1498Szrj return true; 56338fd1498Szrj } 56438fd1498Szrj 56538fd1498Szrj template<typename _TraitsT> 56638fd1498Szrj bool 56738fd1498Szrj _Compiler<_TraitsT>:: _M_try_char()56838fd1498Szrj _M_try_char() 56938fd1498Szrj { 57038fd1498Szrj bool __is_char = false; 57138fd1498Szrj if (_M_match_token(_ScannerT::_S_token_oct_num)) 57238fd1498Szrj { 57338fd1498Szrj __is_char = true; 57438fd1498Szrj _M_value.assign(1, _M_cur_int_value(8)); 57538fd1498Szrj } 57638fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_hex_num)) 57738fd1498Szrj { 57838fd1498Szrj __is_char = true; 57938fd1498Szrj _M_value.assign(1, _M_cur_int_value(16)); 58038fd1498Szrj } 58138fd1498Szrj else if (_M_match_token(_ScannerT::_S_token_ord_char)) 58238fd1498Szrj __is_char = true; 58338fd1498Szrj return __is_char; 58438fd1498Szrj } 58538fd1498Szrj 58638fd1498Szrj template<typename _TraitsT> 58738fd1498Szrj bool 58838fd1498Szrj _Compiler<_TraitsT>:: _M_match_token(_TokenT token)58938fd1498Szrj _M_match_token(_TokenT token) 59038fd1498Szrj { 59138fd1498Szrj if (token == _M_scanner._M_get_token()) 59238fd1498Szrj { 59338fd1498Szrj _M_value = _M_scanner._M_get_value(); 59438fd1498Szrj _M_scanner._M_advance(); 59538fd1498Szrj return true; 59638fd1498Szrj } 59738fd1498Szrj return false; 59838fd1498Szrj } 59938fd1498Szrj 60038fd1498Szrj template<typename _TraitsT> 60138fd1498Szrj int 60238fd1498Szrj _Compiler<_TraitsT>:: _M_cur_int_value(int __radix)60338fd1498Szrj _M_cur_int_value(int __radix) 60438fd1498Szrj { 60538fd1498Szrj long __v = 0; 60638fd1498Szrj for (typename _StringT::size_type __i = 0; 60738fd1498Szrj __i < _M_value.length(); ++__i) 60838fd1498Szrj __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 60938fd1498Szrj return __v; 61038fd1498Szrj } 61138fd1498Szrj 61238fd1498Szrj template<typename _TraitsT, bool __icase, bool __collate> 61338fd1498Szrj bool 61438fd1498Szrj _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch,false_type) const61538fd1498Szrj _M_apply(_CharT __ch, false_type) const 61638fd1498Szrj { 61738fd1498Szrj return [this, __ch] 61838fd1498Szrj { 61938fd1498Szrj if (std::binary_search(_M_char_set.begin(), _M_char_set.end(), 62038fd1498Szrj _M_translator._M_translate(__ch))) 62138fd1498Szrj return true; 62238fd1498Szrj auto __s = _M_translator._M_transform(__ch); 62338fd1498Szrj for (auto& __it : _M_range_set) 62438fd1498Szrj if (_M_translator._M_match_range(__it.first, __it.second, __s)) 62538fd1498Szrj return true; 62638fd1498Szrj if (_M_traits.isctype(__ch, _M_class_set)) 62738fd1498Szrj return true; 62838fd1498Szrj if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 62938fd1498Szrj _M_traits.transform_primary(&__ch, &__ch+1)) 63038fd1498Szrj != _M_equiv_set.end()) 63138fd1498Szrj return true; 63238fd1498Szrj for (auto& __it : _M_neg_class_set) 63338fd1498Szrj if (!_M_traits.isctype(__ch, __it)) 63438fd1498Szrj return true; 63538fd1498Szrj return false; 63638fd1498Szrj }() ^ _M_is_non_matching; 63738fd1498Szrj } 63838fd1498Szrj } // namespace __detail 63938fd1498Szrj 64038fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION 64138fd1498Szrj } // namespace 642