1*38fd1498Szrj // class template regex -*- C++ -*- 2*38fd1498Szrj 3*38fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc. 4*38fd1498Szrj // 5*38fd1498Szrj // This file is part of the GNU ISO C++ Library. This library is free 6*38fd1498Szrj // software; you can redistribute it and/or modify it under the 7*38fd1498Szrj // terms of the GNU General Public License as published by the 8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option) 9*38fd1498Szrj // any later version. 10*38fd1498Szrj 11*38fd1498Szrj // This library is distributed in the hope that it will be useful, 12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of 13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14*38fd1498Szrj // GNU General Public License for more details. 15*38fd1498Szrj 16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional 17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version 18*38fd1498Szrj // 3.1, as published by the Free Software Foundation. 19*38fd1498Szrj 20*38fd1498Szrj // You should have received a copy of the GNU General Public License and 21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program; 22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23*38fd1498Szrj // <http://www.gnu.org/licenses/>. 24*38fd1498Szrj 25*38fd1498Szrj /** 26*38fd1498Szrj * @file bits/regex_automaton.h 27*38fd1498Szrj * This is an internal header file, included by other library headers. 28*38fd1498Szrj * Do not attempt to use it directly. @headername{regex} 29*38fd1498Szrj */ 30*38fd1498Szrj 31*38fd1498Szrj // This macro defines the maximal state number a NFA can have. 32*38fd1498Szrj #ifndef _GLIBCXX_REGEX_STATE_LIMIT 33*38fd1498Szrj #define _GLIBCXX_REGEX_STATE_LIMIT 100000 34*38fd1498Szrj #endif 35*38fd1498Szrj 36*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default) 37*38fd1498Szrj { 38*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION 39*38fd1498Szrj 40*38fd1498Szrj namespace __detail 41*38fd1498Szrj { 42*38fd1498Szrj /** 43*38fd1498Szrj * @defgroup regex-detail Base and Implementation Classes 44*38fd1498Szrj * @ingroup regex 45*38fd1498Szrj * @{ 46*38fd1498Szrj */ 47*38fd1498Szrj 48*38fd1498Szrj typedef long _StateIdT; 49*38fd1498Szrj static const _StateIdT _S_invalid_state_id = -1; 50*38fd1498Szrj 51*38fd1498Szrj template<typename _CharT> 52*38fd1498Szrj using _Matcher = std::function<bool (_CharT)>; 53*38fd1498Szrj 54*38fd1498Szrj /// Operation codes that define the type of transitions within the base NFA 55*38fd1498Szrj /// that represents the regular expression. 56*38fd1498Szrj enum _Opcode : int 57*38fd1498Szrj { 58*38fd1498Szrj _S_opcode_unknown, 59*38fd1498Szrj _S_opcode_alternative, 60*38fd1498Szrj _S_opcode_repeat, 61*38fd1498Szrj _S_opcode_backref, 62*38fd1498Szrj _S_opcode_line_begin_assertion, 63*38fd1498Szrj _S_opcode_line_end_assertion, 64*38fd1498Szrj _S_opcode_word_boundary, 65*38fd1498Szrj _S_opcode_subexpr_lookahead, 66*38fd1498Szrj _S_opcode_subexpr_begin, 67*38fd1498Szrj _S_opcode_subexpr_end, 68*38fd1498Szrj _S_opcode_dummy, 69*38fd1498Szrj _S_opcode_match, 70*38fd1498Szrj _S_opcode_accept, 71*38fd1498Szrj }; 72*38fd1498Szrj 73*38fd1498Szrj struct _State_base 74*38fd1498Szrj { 75*38fd1498Szrj protected: 76*38fd1498Szrj _Opcode _M_opcode; // type of outgoing transition 77*38fd1498Szrj 78*38fd1498Szrj public: 79*38fd1498Szrj _StateIdT _M_next; // outgoing transition 80*38fd1498Szrj union // Since they are mutually exclusive. 81*38fd1498Szrj { 82*38fd1498Szrj size_t _M_subexpr; // for _S_opcode_subexpr_* 83*38fd1498Szrj size_t _M_backref_index; // for _S_opcode_backref 84*38fd1498Szrj struct 85*38fd1498Szrj { 86*38fd1498Szrj // for _S_opcode_alternative, _S_opcode_repeat and 87*38fd1498Szrj // _S_opcode_subexpr_lookahead 88*38fd1498Szrj _StateIdT _M_alt; 89*38fd1498Szrj // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or 90*38fd1498Szrj // quantifiers (ungreedy if set true) 91*38fd1498Szrj bool _M_neg; 92*38fd1498Szrj }; 93*38fd1498Szrj // For _S_opcode_match 94*38fd1498Szrj __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage; 95*38fd1498Szrj }; 96*38fd1498Szrj 97*38fd1498Szrj protected: 98*38fd1498Szrj explicit _State_base(_Opcode __opcode) 99*38fd1498Szrj : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 100*38fd1498Szrj { } 101*38fd1498Szrj 102*38fd1498Szrj public: 103*38fd1498Szrj bool 104*38fd1498Szrj _M_has_alt() 105*38fd1498Szrj { 106*38fd1498Szrj return _M_opcode == _S_opcode_alternative 107*38fd1498Szrj || _M_opcode == _S_opcode_repeat 108*38fd1498Szrj || _M_opcode == _S_opcode_subexpr_lookahead; 109*38fd1498Szrj } 110*38fd1498Szrj 111*38fd1498Szrj #ifdef _GLIBCXX_DEBUG 112*38fd1498Szrj std::ostream& 113*38fd1498Szrj _M_print(std::ostream& ostr) const; 114*38fd1498Szrj 115*38fd1498Szrj // Prints graphviz dot commands for state. 116*38fd1498Szrj std::ostream& 117*38fd1498Szrj _M_dot(std::ostream& __ostr, _StateIdT __id) const; 118*38fd1498Szrj #endif 119*38fd1498Szrj }; 120*38fd1498Szrj 121*38fd1498Szrj template<typename _Char_type> 122*38fd1498Szrj struct _State : _State_base 123*38fd1498Szrj { 124*38fd1498Szrj typedef _Matcher<_Char_type> _MatcherT; 125*38fd1498Szrj static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>), 126*38fd1498Szrj "std::function<bool(T)> has the same size as " 127*38fd1498Szrj "std::function<bool(char)>"); 128*38fd1498Szrj static_assert(alignof(_MatcherT) == alignof(_Matcher<char>), 129*38fd1498Szrj "std::function<bool(T)> has the same alignment as " 130*38fd1498Szrj "std::function<bool(char)>"); 131*38fd1498Szrj 132*38fd1498Szrj explicit 133*38fd1498Szrj _State(_Opcode __opcode) : _State_base(__opcode) 134*38fd1498Szrj { 135*38fd1498Szrj if (_M_opcode() == _S_opcode_match) 136*38fd1498Szrj new (this->_M_matcher_storage._M_addr()) _MatcherT(); 137*38fd1498Szrj } 138*38fd1498Szrj 139*38fd1498Szrj _State(const _State& __rhs) : _State_base(__rhs) 140*38fd1498Szrj { 141*38fd1498Szrj if (__rhs._M_opcode() == _S_opcode_match) 142*38fd1498Szrj new (this->_M_matcher_storage._M_addr()) 143*38fd1498Szrj _MatcherT(__rhs._M_get_matcher()); 144*38fd1498Szrj } 145*38fd1498Szrj 146*38fd1498Szrj _State(_State&& __rhs) : _State_base(__rhs) 147*38fd1498Szrj { 148*38fd1498Szrj if (__rhs._M_opcode() == _S_opcode_match) 149*38fd1498Szrj new (this->_M_matcher_storage._M_addr()) 150*38fd1498Szrj _MatcherT(std::move(__rhs._M_get_matcher())); 151*38fd1498Szrj } 152*38fd1498Szrj 153*38fd1498Szrj _State& 154*38fd1498Szrj operator=(const _State&) = delete; 155*38fd1498Szrj 156*38fd1498Szrj ~_State() 157*38fd1498Szrj { 158*38fd1498Szrj if (_M_opcode() == _S_opcode_match) 159*38fd1498Szrj _M_get_matcher().~_MatcherT(); 160*38fd1498Szrj } 161*38fd1498Szrj 162*38fd1498Szrj // Since correct ctor and dtor rely on _M_opcode, it's better not to 163*38fd1498Szrj // change it over time. 164*38fd1498Szrj _Opcode 165*38fd1498Szrj _M_opcode() const 166*38fd1498Szrj { return _State_base::_M_opcode; } 167*38fd1498Szrj 168*38fd1498Szrj bool 169*38fd1498Szrj _M_matches(_Char_type __char) const 170*38fd1498Szrj { return _M_get_matcher()(__char); } 171*38fd1498Szrj 172*38fd1498Szrj _MatcherT& 173*38fd1498Szrj _M_get_matcher() 174*38fd1498Szrj { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); } 175*38fd1498Szrj 176*38fd1498Szrj const _MatcherT& 177*38fd1498Szrj _M_get_matcher() const 178*38fd1498Szrj { 179*38fd1498Szrj return *static_cast<const _MatcherT*>( 180*38fd1498Szrj this->_M_matcher_storage._M_addr()); 181*38fd1498Szrj } 182*38fd1498Szrj }; 183*38fd1498Szrj 184*38fd1498Szrj struct _NFA_base 185*38fd1498Szrj { 186*38fd1498Szrj typedef size_t _SizeT; 187*38fd1498Szrj typedef regex_constants::syntax_option_type _FlagT; 188*38fd1498Szrj 189*38fd1498Szrj explicit 190*38fd1498Szrj _NFA_base(_FlagT __f) 191*38fd1498Szrj : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), 192*38fd1498Szrj _M_has_backref(false) 193*38fd1498Szrj { } 194*38fd1498Szrj 195*38fd1498Szrj _NFA_base(_NFA_base&&) = default; 196*38fd1498Szrj 197*38fd1498Szrj protected: 198*38fd1498Szrj ~_NFA_base() = default; 199*38fd1498Szrj 200*38fd1498Szrj public: 201*38fd1498Szrj _FlagT 202*38fd1498Szrj _M_options() const 203*38fd1498Szrj { return _M_flags; } 204*38fd1498Szrj 205*38fd1498Szrj _StateIdT 206*38fd1498Szrj _M_start() const 207*38fd1498Szrj { return _M_start_state; } 208*38fd1498Szrj 209*38fd1498Szrj _SizeT 210*38fd1498Szrj _M_sub_count() const 211*38fd1498Szrj { return _M_subexpr_count; } 212*38fd1498Szrj 213*38fd1498Szrj std::vector<size_t> _M_paren_stack; 214*38fd1498Szrj _FlagT _M_flags; 215*38fd1498Szrj _StateIdT _M_start_state; 216*38fd1498Szrj _SizeT _M_subexpr_count; 217*38fd1498Szrj bool _M_has_backref; 218*38fd1498Szrj }; 219*38fd1498Szrj 220*38fd1498Szrj template<typename _TraitsT> 221*38fd1498Szrj struct _NFA 222*38fd1498Szrj : _NFA_base, std::vector<_State<typename _TraitsT::char_type>> 223*38fd1498Szrj { 224*38fd1498Szrj typedef typename _TraitsT::char_type _Char_type; 225*38fd1498Szrj typedef _State<_Char_type> _StateT; 226*38fd1498Szrj typedef _Matcher<_Char_type> _MatcherT; 227*38fd1498Szrj 228*38fd1498Szrj _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags) 229*38fd1498Szrj : _NFA_base(__flags) 230*38fd1498Szrj { _M_traits.imbue(__loc); } 231*38fd1498Szrj 232*38fd1498Szrj // for performance reasons _NFA objects should only be moved not copied 233*38fd1498Szrj _NFA(const _NFA&) = delete; 234*38fd1498Szrj _NFA(_NFA&&) = default; 235*38fd1498Szrj 236*38fd1498Szrj _StateIdT 237*38fd1498Szrj _M_insert_accept() 238*38fd1498Szrj { 239*38fd1498Szrj auto __ret = _M_insert_state(_StateT(_S_opcode_accept)); 240*38fd1498Szrj return __ret; 241*38fd1498Szrj } 242*38fd1498Szrj 243*38fd1498Szrj _StateIdT 244*38fd1498Szrj _M_insert_alt(_StateIdT __next, _StateIdT __alt, 245*38fd1498Szrj bool __neg __attribute__((__unused__))) 246*38fd1498Szrj { 247*38fd1498Szrj _StateT __tmp(_S_opcode_alternative); 248*38fd1498Szrj // It labels every quantifier to make greedy comparison easier in BFS 249*38fd1498Szrj // approach. 250*38fd1498Szrj __tmp._M_next = __next; 251*38fd1498Szrj __tmp._M_alt = __alt; 252*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 253*38fd1498Szrj } 254*38fd1498Szrj 255*38fd1498Szrj _StateIdT 256*38fd1498Szrj _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg) 257*38fd1498Szrj { 258*38fd1498Szrj _StateT __tmp(_S_opcode_repeat); 259*38fd1498Szrj // It labels every quantifier to make greedy comparison easier in BFS 260*38fd1498Szrj // approach. 261*38fd1498Szrj __tmp._M_next = __next; 262*38fd1498Szrj __tmp._M_alt = __alt; 263*38fd1498Szrj __tmp._M_neg = __neg; 264*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 265*38fd1498Szrj } 266*38fd1498Szrj 267*38fd1498Szrj _StateIdT 268*38fd1498Szrj _M_insert_matcher(_MatcherT __m) 269*38fd1498Szrj { 270*38fd1498Szrj _StateT __tmp(_S_opcode_match); 271*38fd1498Szrj __tmp._M_get_matcher() = std::move(__m); 272*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 273*38fd1498Szrj } 274*38fd1498Szrj 275*38fd1498Szrj _StateIdT 276*38fd1498Szrj _M_insert_subexpr_begin() 277*38fd1498Szrj { 278*38fd1498Szrj auto __id = this->_M_subexpr_count++; 279*38fd1498Szrj this->_M_paren_stack.push_back(__id); 280*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_begin); 281*38fd1498Szrj __tmp._M_subexpr = __id; 282*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 283*38fd1498Szrj } 284*38fd1498Szrj 285*38fd1498Szrj _StateIdT 286*38fd1498Szrj _M_insert_subexpr_end() 287*38fd1498Szrj { 288*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_end); 289*38fd1498Szrj __tmp._M_subexpr = this->_M_paren_stack.back(); 290*38fd1498Szrj this->_M_paren_stack.pop_back(); 291*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 292*38fd1498Szrj } 293*38fd1498Szrj 294*38fd1498Szrj _StateIdT 295*38fd1498Szrj _M_insert_backref(size_t __index); 296*38fd1498Szrj 297*38fd1498Szrj _StateIdT 298*38fd1498Szrj _M_insert_line_begin() 299*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); } 300*38fd1498Szrj 301*38fd1498Szrj _StateIdT 302*38fd1498Szrj _M_insert_line_end() 303*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); } 304*38fd1498Szrj 305*38fd1498Szrj _StateIdT 306*38fd1498Szrj _M_insert_word_bound(bool __neg) 307*38fd1498Szrj { 308*38fd1498Szrj _StateT __tmp(_S_opcode_word_boundary); 309*38fd1498Szrj __tmp._M_neg = __neg; 310*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 311*38fd1498Szrj } 312*38fd1498Szrj 313*38fd1498Szrj _StateIdT 314*38fd1498Szrj _M_insert_lookahead(_StateIdT __alt, bool __neg) 315*38fd1498Szrj { 316*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_lookahead); 317*38fd1498Szrj __tmp._M_alt = __alt; 318*38fd1498Szrj __tmp._M_neg = __neg; 319*38fd1498Szrj return _M_insert_state(std::move(__tmp)); 320*38fd1498Szrj } 321*38fd1498Szrj 322*38fd1498Szrj _StateIdT 323*38fd1498Szrj _M_insert_dummy() 324*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_dummy)); } 325*38fd1498Szrj 326*38fd1498Szrj _StateIdT 327*38fd1498Szrj _M_insert_state(_StateT __s) 328*38fd1498Szrj { 329*38fd1498Szrj this->push_back(std::move(__s)); 330*38fd1498Szrj if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT) 331*38fd1498Szrj __throw_regex_error( 332*38fd1498Szrj regex_constants::error_space, 333*38fd1498Szrj "Number of NFA states exceeds limit. Please use shorter regex " 334*38fd1498Szrj "string, or use smaller brace expression, or make " 335*38fd1498Szrj "_GLIBCXX_REGEX_STATE_LIMIT larger."); 336*38fd1498Szrj return this->size()-1; 337*38fd1498Szrj } 338*38fd1498Szrj 339*38fd1498Szrj // Eliminate dummy node in this NFA to make it compact. 340*38fd1498Szrj void 341*38fd1498Szrj _M_eliminate_dummy(); 342*38fd1498Szrj 343*38fd1498Szrj #ifdef _GLIBCXX_DEBUG 344*38fd1498Szrj std::ostream& 345*38fd1498Szrj _M_dot(std::ostream& __ostr) const; 346*38fd1498Szrj #endif 347*38fd1498Szrj public: 348*38fd1498Szrj _TraitsT _M_traits; 349*38fd1498Szrj }; 350*38fd1498Szrj 351*38fd1498Szrj /// Describes a sequence of one or more %_State, its current start 352*38fd1498Szrj /// and end(s). This structure contains fragments of an NFA during 353*38fd1498Szrj /// construction. 354*38fd1498Szrj template<typename _TraitsT> 355*38fd1498Szrj class _StateSeq 356*38fd1498Szrj { 357*38fd1498Szrj public: 358*38fd1498Szrj typedef _NFA<_TraitsT> _RegexT; 359*38fd1498Szrj 360*38fd1498Szrj public: 361*38fd1498Szrj _StateSeq(_RegexT& __nfa, _StateIdT __s) 362*38fd1498Szrj : _M_nfa(__nfa), _M_start(__s), _M_end(__s) 363*38fd1498Szrj { } 364*38fd1498Szrj 365*38fd1498Szrj _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end) 366*38fd1498Szrj : _M_nfa(__nfa), _M_start(__s), _M_end(__end) 367*38fd1498Szrj { } 368*38fd1498Szrj 369*38fd1498Szrj // Append a state on *this and change *this to the new sequence. 370*38fd1498Szrj void 371*38fd1498Szrj _M_append(_StateIdT __id) 372*38fd1498Szrj { 373*38fd1498Szrj _M_nfa[_M_end]._M_next = __id; 374*38fd1498Szrj _M_end = __id; 375*38fd1498Szrj } 376*38fd1498Szrj 377*38fd1498Szrj // Append a sequence on *this and change *this to the new sequence. 378*38fd1498Szrj void 379*38fd1498Szrj _M_append(const _StateSeq& __s) 380*38fd1498Szrj { 381*38fd1498Szrj _M_nfa[_M_end]._M_next = __s._M_start; 382*38fd1498Szrj _M_end = __s._M_end; 383*38fd1498Szrj } 384*38fd1498Szrj 385*38fd1498Szrj // Clones an entire sequence. 386*38fd1498Szrj _StateSeq 387*38fd1498Szrj _M_clone(); 388*38fd1498Szrj 389*38fd1498Szrj public: 390*38fd1498Szrj _RegexT& _M_nfa; 391*38fd1498Szrj _StateIdT _M_start; 392*38fd1498Szrj _StateIdT _M_end; 393*38fd1498Szrj }; 394*38fd1498Szrj 395*38fd1498Szrj //@} regex-detail 396*38fd1498Szrj } // namespace __detail 397*38fd1498Szrj 398*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION 399*38fd1498Szrj } // namespace std 400*38fd1498Szrj 401*38fd1498Szrj #include <bits/regex_automaton.tcc> 402