xref: /netbsd-src/external/gpl3/gcc/dist/libstdc++-v3/include/bits/regex_automaton.h (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
14d5abbe8Smrg // class template regex -*- C++ -*-
24d5abbe8Smrg 
3b1e83836Smrg // Copyright (C) 2013-2022 Free Software Foundation, Inc.
44d5abbe8Smrg //
54d5abbe8Smrg // This file is part of the GNU ISO C++ Library.  This library is free
64d5abbe8Smrg // software; you can redistribute it and/or modify it under the
74d5abbe8Smrg // terms of the GNU General Public License as published by the
84d5abbe8Smrg // Free Software Foundation; either version 3, or (at your option)
94d5abbe8Smrg // any later version.
104d5abbe8Smrg 
114d5abbe8Smrg // This library is distributed in the hope that it will be useful,
124d5abbe8Smrg // but WITHOUT ANY WARRANTY; without even the implied warranty of
134d5abbe8Smrg // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
144d5abbe8Smrg // GNU General Public License for more details.
154d5abbe8Smrg 
164d5abbe8Smrg // Under Section 7 of GPL version 3, you are granted additional
174d5abbe8Smrg // permissions described in the GCC Runtime Library Exception, version
184d5abbe8Smrg // 3.1, as published by the Free Software Foundation.
194d5abbe8Smrg 
204d5abbe8Smrg // You should have received a copy of the GNU General Public License and
214d5abbe8Smrg // a copy of the GCC Runtime Library Exception along with this program;
224d5abbe8Smrg // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
234d5abbe8Smrg // <http://www.gnu.org/licenses/>.
244d5abbe8Smrg 
254d5abbe8Smrg /**
264d5abbe8Smrg  *  @file bits/regex_automaton.h
274d5abbe8Smrg  *  This is an internal header file, included by other library headers.
284d5abbe8Smrg  *  Do not attempt to use it directly. @headername{regex}
294d5abbe8Smrg  */
304d5abbe8Smrg 
314d5abbe8Smrg // This macro defines the maximal state number a NFA can have.
324d5abbe8Smrg #ifndef _GLIBCXX_REGEX_STATE_LIMIT
334d5abbe8Smrg #define _GLIBCXX_REGEX_STATE_LIMIT 100000
344d5abbe8Smrg #endif
354d5abbe8Smrg 
_GLIBCXX_VISIBILITY(default)364d5abbe8Smrg namespace std _GLIBCXX_VISIBILITY(default)
374d5abbe8Smrg {
388b6133e5Smrg _GLIBCXX_BEGIN_NAMESPACE_VERSION
398b6133e5Smrg 
40a3e9eb18Smrg namespace __detail
41a3e9eb18Smrg {
424d5abbe8Smrg   /**
434d5abbe8Smrg    *  @defgroup regex-detail Base and Implementation Classes
444d5abbe8Smrg    *  @ingroup regex
454d5abbe8Smrg    *  @{
464d5abbe8Smrg    */
474d5abbe8Smrg 
484d5abbe8Smrg   typedef long _StateIdT;
49b1e83836Smrg   _GLIBCXX17_INLINE constexpr _StateIdT _S_invalid_state_id  = -1;
504d5abbe8Smrg 
514d5abbe8Smrg   template<typename _CharT>
524d5abbe8Smrg     using _Matcher = std::function<bool (_CharT)>;
534d5abbe8Smrg 
544d5abbe8Smrg   /// Operation codes that define the type of transitions within the base NFA
554d5abbe8Smrg   /// that represents the regular expression.
564d5abbe8Smrg   enum _Opcode : int
574d5abbe8Smrg   {
584d5abbe8Smrg       _S_opcode_unknown,
594d5abbe8Smrg       _S_opcode_alternative,
604d5abbe8Smrg       _S_opcode_repeat,
614d5abbe8Smrg       _S_opcode_backref,
624d5abbe8Smrg       _S_opcode_line_begin_assertion,
634d5abbe8Smrg       _S_opcode_line_end_assertion,
644d5abbe8Smrg       _S_opcode_word_boundary,
654d5abbe8Smrg       _S_opcode_subexpr_lookahead,
664d5abbe8Smrg       _S_opcode_subexpr_begin,
674d5abbe8Smrg       _S_opcode_subexpr_end,
684d5abbe8Smrg       _S_opcode_dummy,
694d5abbe8Smrg       _S_opcode_match,
704d5abbe8Smrg       _S_opcode_accept,
714d5abbe8Smrg   };
724d5abbe8Smrg 
734d5abbe8Smrg   struct _State_base
744d5abbe8Smrg   {
75f9a78e0eSmrg   protected:
764d5abbe8Smrg     _Opcode      _M_opcode;           // type of outgoing transition
77f9a78e0eSmrg 
78f9a78e0eSmrg   public:
794d5abbe8Smrg     _StateIdT    _M_next;             // outgoing transition
804d5abbe8Smrg     union // Since they are mutually exclusive.
814d5abbe8Smrg     {
824d5abbe8Smrg       size_t _M_subexpr;        // for _S_opcode_subexpr_*
834d5abbe8Smrg       size_t _M_backref_index;  // for _S_opcode_backref
844d5abbe8Smrg       struct
854d5abbe8Smrg       {
864d5abbe8Smrg 	// for _S_opcode_alternative, _S_opcode_repeat and
874d5abbe8Smrg 	// _S_opcode_subexpr_lookahead
884d5abbe8Smrg 	_StateIdT  _M_alt;
894d5abbe8Smrg 	// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
904d5abbe8Smrg 	// quantifiers (ungreedy if set true)
914d5abbe8Smrg 	bool       _M_neg;
924d5abbe8Smrg       };
93f9a78e0eSmrg       // For _S_opcode_match
94f9a78e0eSmrg       __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
954d5abbe8Smrg     };
964d5abbe8Smrg 
97f9a78e0eSmrg   protected:
987d4dc15bSmrg     explicit _State_base(_Opcode __opcode) noexcept
994d5abbe8Smrg     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
1004d5abbe8Smrg     { }
1014d5abbe8Smrg 
1024d5abbe8Smrg   public:
103f9a78e0eSmrg     bool
1047d4dc15bSmrg     _M_has_alt() const noexcept
105f9a78e0eSmrg     {
106f9a78e0eSmrg       return _M_opcode == _S_opcode_alternative
107f9a78e0eSmrg 	|| _M_opcode == _S_opcode_repeat
108f9a78e0eSmrg 	|| _M_opcode == _S_opcode_subexpr_lookahead;
109f9a78e0eSmrg     }
110f9a78e0eSmrg 
1114d5abbe8Smrg #ifdef _GLIBCXX_DEBUG
1124d5abbe8Smrg     std::ostream&
113*0a307195Smrg     _M_print(std::ostream& __ostr) const;
1144d5abbe8Smrg 
1154d5abbe8Smrg     // Prints graphviz dot commands for state.
1164d5abbe8Smrg     std::ostream&
1174d5abbe8Smrg     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
1184d5abbe8Smrg #endif
1194d5abbe8Smrg   };
1204d5abbe8Smrg 
121f9a78e0eSmrg   template<typename _Char_type>
1224d5abbe8Smrg     struct _State : _State_base
1234d5abbe8Smrg     {
124f9a78e0eSmrg       typedef _Matcher<_Char_type> _MatcherT;
125f9a78e0eSmrg       static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
126f9a78e0eSmrg 		    "std::function<bool(T)> has the same size as "
127f9a78e0eSmrg 		    "std::function<bool(char)>");
128f9a78e0eSmrg       static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
129f9a78e0eSmrg 		    "std::function<bool(T)> has the same alignment as "
130f9a78e0eSmrg 		    "std::function<bool(char)>");
1314d5abbe8Smrg 
132f9a78e0eSmrg       explicit
1337d4dc15bSmrg       _State(_Opcode __opcode) noexcept : _State_base(__opcode)
134f9a78e0eSmrg       {
135f9a78e0eSmrg 	if (_M_opcode() == _S_opcode_match)
136f9a78e0eSmrg 	  new (this->_M_matcher_storage._M_addr()) _MatcherT();
137f9a78e0eSmrg       }
1384d5abbe8Smrg 
139f9a78e0eSmrg       _State(const _State& __rhs) : _State_base(__rhs)
140f9a78e0eSmrg       {
141f9a78e0eSmrg 	if (__rhs._M_opcode() == _S_opcode_match)
142f9a78e0eSmrg 	  new (this->_M_matcher_storage._M_addr())
143f9a78e0eSmrg 	    _MatcherT(__rhs._M_get_matcher());
144f9a78e0eSmrg       }
145f9a78e0eSmrg 
1467d4dc15bSmrg       _State(_State&& __rhs) noexcept : _State_base(__rhs)
147f9a78e0eSmrg       {
148f9a78e0eSmrg 	if (__rhs._M_opcode() == _S_opcode_match)
149f9a78e0eSmrg 	  new (this->_M_matcher_storage._M_addr())
150f9a78e0eSmrg 	    _MatcherT(std::move(__rhs._M_get_matcher()));
151f9a78e0eSmrg       }
152f9a78e0eSmrg 
153f9a78e0eSmrg       _State&
154f9a78e0eSmrg       operator=(const _State&) = delete;
155f9a78e0eSmrg 
156f9a78e0eSmrg       ~_State()
157f9a78e0eSmrg       {
158f9a78e0eSmrg 	if (_M_opcode() == _S_opcode_match)
159f9a78e0eSmrg 	  _M_get_matcher().~_MatcherT();
160f9a78e0eSmrg       }
161f9a78e0eSmrg 
162f9a78e0eSmrg       // Since correct ctor and dtor rely on _M_opcode, it's better not to
163f9a78e0eSmrg       // change it over time.
164f9a78e0eSmrg       _Opcode
1657d4dc15bSmrg       _M_opcode() const noexcept
166f9a78e0eSmrg       { return _State_base::_M_opcode; }
167f9a78e0eSmrg 
168f9a78e0eSmrg       bool
169f9a78e0eSmrg       _M_matches(_Char_type __char) const
170f9a78e0eSmrg       { return _M_get_matcher()(__char); }
171f9a78e0eSmrg 
172f9a78e0eSmrg       _MatcherT&
1737d4dc15bSmrg       _M_get_matcher() noexcept
174f9a78e0eSmrg       { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
175f9a78e0eSmrg 
176f9a78e0eSmrg       const _MatcherT&
1777d4dc15bSmrg       _M_get_matcher() const noexcept
178f9a78e0eSmrg       {
179f9a78e0eSmrg 	return *static_cast<const _MatcherT*>(
180f9a78e0eSmrg 	    this->_M_matcher_storage._M_addr());
181f9a78e0eSmrg       }
1824d5abbe8Smrg     };
1834d5abbe8Smrg 
1844d5abbe8Smrg   struct _NFA_base
1854d5abbe8Smrg   {
1864d5abbe8Smrg     typedef regex_constants::syntax_option_type _FlagT;
1874d5abbe8Smrg 
1884d5abbe8Smrg     explicit
1897d4dc15bSmrg     _NFA_base(_FlagT __f) noexcept
1904d5abbe8Smrg     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
1914d5abbe8Smrg     _M_has_backref(false)
1924d5abbe8Smrg     { }
1934d5abbe8Smrg 
1944d5abbe8Smrg     _NFA_base(_NFA_base&&) = default;
1954d5abbe8Smrg 
1964d5abbe8Smrg   protected:
1974d5abbe8Smrg     ~_NFA_base() = default;
1984d5abbe8Smrg 
1994d5abbe8Smrg   public:
2004d5abbe8Smrg     _FlagT
2017d4dc15bSmrg     _M_options() const noexcept
2024d5abbe8Smrg     { return _M_flags; }
2034d5abbe8Smrg 
2044d5abbe8Smrg     _StateIdT
2057d4dc15bSmrg     _M_start() const noexcept
2064d5abbe8Smrg     { return _M_start_state; }
2074d5abbe8Smrg 
208b1e83836Smrg     size_t
2097d4dc15bSmrg     _M_sub_count() const noexcept
2104d5abbe8Smrg     { return _M_subexpr_count; }
2114d5abbe8Smrg 
212181254a7Smrg     _GLIBCXX_STD_C::vector<size_t> _M_paren_stack;
2134d5abbe8Smrg     _FlagT                    _M_flags;
2144d5abbe8Smrg     _StateIdT                 _M_start_state;
215b1e83836Smrg     size_t                    _M_subexpr_count;
2164d5abbe8Smrg     bool                      _M_has_backref;
2174d5abbe8Smrg   };
2184d5abbe8Smrg 
2194d5abbe8Smrg   template<typename _TraitsT>
2204d5abbe8Smrg     struct _NFA
221181254a7Smrg     : _NFA_base, _GLIBCXX_STD_C::vector<_State<typename _TraitsT::char_type>>
2224d5abbe8Smrg     {
223f9a78e0eSmrg       typedef typename _TraitsT::char_type	_Char_type;
224f9a78e0eSmrg       typedef _State<_Char_type>		_StateT;
225f9a78e0eSmrg       typedef _Matcher<_Char_type>		_MatcherT;
2264d5abbe8Smrg 
2274d5abbe8Smrg       _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
2284d5abbe8Smrg       : _NFA_base(__flags)
2294d5abbe8Smrg       { _M_traits.imbue(__loc); }
2304d5abbe8Smrg 
2314d5abbe8Smrg       // for performance reasons _NFA objects should only be moved not copied
2324d5abbe8Smrg       _NFA(const _NFA&) = delete;
2334d5abbe8Smrg       _NFA(_NFA&&) = default;
2344d5abbe8Smrg 
2354d5abbe8Smrg       _StateIdT
2364d5abbe8Smrg       _M_insert_accept()
2374d5abbe8Smrg       {
2384d5abbe8Smrg 	auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
2394d5abbe8Smrg 	return __ret;
2404d5abbe8Smrg       }
2414d5abbe8Smrg 
2424d5abbe8Smrg       _StateIdT
243b17d1066Smrg       _M_insert_alt(_StateIdT __next, _StateIdT __alt,
244b17d1066Smrg 		    bool __neg __attribute__((__unused__)))
2454d5abbe8Smrg       {
2464d5abbe8Smrg 	_StateT __tmp(_S_opcode_alternative);
2474d5abbe8Smrg 	// It labels every quantifier to make greedy comparison easier in BFS
2484d5abbe8Smrg 	// approach.
2494d5abbe8Smrg 	__tmp._M_next = __next;
2504d5abbe8Smrg 	__tmp._M_alt = __alt;
2514d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
2524d5abbe8Smrg       }
2534d5abbe8Smrg 
2544d5abbe8Smrg       _StateIdT
2554d5abbe8Smrg       _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
2564d5abbe8Smrg       {
2574d5abbe8Smrg 	_StateT __tmp(_S_opcode_repeat);
2584d5abbe8Smrg 	// It labels every quantifier to make greedy comparison easier in BFS
2594d5abbe8Smrg 	// approach.
2604d5abbe8Smrg 	__tmp._M_next = __next;
2614d5abbe8Smrg 	__tmp._M_alt = __alt;
2624d5abbe8Smrg 	__tmp._M_neg = __neg;
2634d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
2644d5abbe8Smrg       }
2654d5abbe8Smrg 
2664d5abbe8Smrg       _StateIdT
2674d5abbe8Smrg       _M_insert_matcher(_MatcherT __m)
2684d5abbe8Smrg       {
2694d5abbe8Smrg 	_StateT __tmp(_S_opcode_match);
270f9a78e0eSmrg 	__tmp._M_get_matcher() = std::move(__m);
2714d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
2724d5abbe8Smrg       }
2734d5abbe8Smrg 
2744d5abbe8Smrg       _StateIdT
2754d5abbe8Smrg       _M_insert_subexpr_begin()
2764d5abbe8Smrg       {
2774d5abbe8Smrg 	auto __id = this->_M_subexpr_count++;
2784d5abbe8Smrg 	this->_M_paren_stack.push_back(__id);
2794d5abbe8Smrg 	_StateT __tmp(_S_opcode_subexpr_begin);
2804d5abbe8Smrg 	__tmp._M_subexpr = __id;
2814d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
2824d5abbe8Smrg       }
2834d5abbe8Smrg 
2844d5abbe8Smrg       _StateIdT
2854d5abbe8Smrg       _M_insert_subexpr_end()
2864d5abbe8Smrg       {
2874d5abbe8Smrg 	_StateT __tmp(_S_opcode_subexpr_end);
2884d5abbe8Smrg 	__tmp._M_subexpr = this->_M_paren_stack.back();
2894d5abbe8Smrg 	this->_M_paren_stack.pop_back();
2904d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
2914d5abbe8Smrg       }
2924d5abbe8Smrg 
2934d5abbe8Smrg       _StateIdT
2944d5abbe8Smrg       _M_insert_backref(size_t __index);
2954d5abbe8Smrg 
2964d5abbe8Smrg       _StateIdT
2974d5abbe8Smrg       _M_insert_line_begin()
2984d5abbe8Smrg       { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
2994d5abbe8Smrg 
3004d5abbe8Smrg       _StateIdT
3014d5abbe8Smrg       _M_insert_line_end()
3024d5abbe8Smrg       { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
3034d5abbe8Smrg 
3044d5abbe8Smrg       _StateIdT
3054d5abbe8Smrg       _M_insert_word_bound(bool __neg)
3064d5abbe8Smrg       {
3074d5abbe8Smrg 	_StateT __tmp(_S_opcode_word_boundary);
3084d5abbe8Smrg 	__tmp._M_neg = __neg;
3094d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
3104d5abbe8Smrg       }
3114d5abbe8Smrg 
3124d5abbe8Smrg       _StateIdT
3134d5abbe8Smrg       _M_insert_lookahead(_StateIdT __alt, bool __neg)
3144d5abbe8Smrg       {
3154d5abbe8Smrg 	_StateT __tmp(_S_opcode_subexpr_lookahead);
3164d5abbe8Smrg 	__tmp._M_alt = __alt;
3174d5abbe8Smrg 	__tmp._M_neg = __neg;
3184d5abbe8Smrg 	return _M_insert_state(std::move(__tmp));
3194d5abbe8Smrg       }
3204d5abbe8Smrg 
3214d5abbe8Smrg       _StateIdT
3224d5abbe8Smrg       _M_insert_dummy()
3234d5abbe8Smrg       { return _M_insert_state(_StateT(_S_opcode_dummy)); }
3244d5abbe8Smrg 
3254d5abbe8Smrg       _StateIdT
3264d5abbe8Smrg       _M_insert_state(_StateT __s)
3274d5abbe8Smrg       {
3284d5abbe8Smrg 	this->push_back(std::move(__s));
3294d5abbe8Smrg 	if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
330f9a78e0eSmrg 	  __throw_regex_error(
331f9a78e0eSmrg 	    regex_constants::error_space,
332f9a78e0eSmrg 	    "Number of NFA states exceeds limit. Please use shorter regex "
333f9a78e0eSmrg 	    "string, or use smaller brace expression, or make "
334f9a78e0eSmrg 	    "_GLIBCXX_REGEX_STATE_LIMIT larger.");
3354d5abbe8Smrg 	return this->size() - 1;
3364d5abbe8Smrg       }
3374d5abbe8Smrg 
3384d5abbe8Smrg       // Eliminate dummy node in this NFA to make it compact.
3394d5abbe8Smrg       void
3404d5abbe8Smrg       _M_eliminate_dummy();
3414d5abbe8Smrg 
3424d5abbe8Smrg #ifdef _GLIBCXX_DEBUG
3434d5abbe8Smrg       std::ostream&
3444d5abbe8Smrg       _M_dot(std::ostream& __ostr) const;
3454d5abbe8Smrg #endif
3464d5abbe8Smrg     public:
3474d5abbe8Smrg       _TraitsT                  _M_traits;
3484d5abbe8Smrg     };
3494d5abbe8Smrg 
3504d5abbe8Smrg   /// Describes a sequence of one or more %_State, its current start
3514d5abbe8Smrg   /// and end(s).  This structure contains fragments of an NFA during
3524d5abbe8Smrg   /// construction.
3534d5abbe8Smrg   template<typename _TraitsT>
3544d5abbe8Smrg     class _StateSeq
3554d5abbe8Smrg     {
3564d5abbe8Smrg     public:
3574d5abbe8Smrg       typedef _NFA<_TraitsT> _RegexT;
3584d5abbe8Smrg 
3594d5abbe8Smrg     public:
3604d5abbe8Smrg       _StateSeq(_RegexT& __nfa, _StateIdT __s)
3614d5abbe8Smrg       : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
3624d5abbe8Smrg       { }
3634d5abbe8Smrg 
3644d5abbe8Smrg       _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
3654d5abbe8Smrg       : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
3664d5abbe8Smrg       { }
3674d5abbe8Smrg 
3684d5abbe8Smrg       // Append a state on *this and change *this to the new sequence.
3694d5abbe8Smrg       void
3704d5abbe8Smrg       _M_append(_StateIdT __id)
3714d5abbe8Smrg       {
3724d5abbe8Smrg 	_M_nfa[_M_end]._M_next = __id;
3734d5abbe8Smrg 	_M_end = __id;
3744d5abbe8Smrg       }
3754d5abbe8Smrg 
3764d5abbe8Smrg       // Append a sequence on *this and change *this to the new sequence.
3774d5abbe8Smrg       void
3784d5abbe8Smrg       _M_append(const _StateSeq& __s)
3794d5abbe8Smrg       {
3804d5abbe8Smrg 	_M_nfa[_M_end]._M_next = __s._M_start;
3814d5abbe8Smrg 	_M_end = __s._M_end;
3824d5abbe8Smrg       }
3834d5abbe8Smrg 
3844d5abbe8Smrg       // Clones an entire sequence.
3854d5abbe8Smrg       _StateSeq
3864d5abbe8Smrg       _M_clone();
3874d5abbe8Smrg 
3884d5abbe8Smrg     public:
3894d5abbe8Smrg       _RegexT&  _M_nfa;
3904d5abbe8Smrg       _StateIdT _M_start;
3914d5abbe8Smrg       _StateIdT _M_end;
3924d5abbe8Smrg     };
3934d5abbe8Smrg 
394a448f87cSmrg  ///@} regex-detail
3958b6133e5Smrg } // namespace __detail
396a3e9eb18Smrg 
397a3e9eb18Smrg _GLIBCXX_END_NAMESPACE_VERSION
3984d5abbe8Smrg } // namespace std
3994d5abbe8Smrg 
4004d5abbe8Smrg #include <bits/regex_automaton.tcc>
401