xref: /dflybsd-src/contrib/gcc-8.0/libstdc++-v3/include/bits/regex_automaton.h (revision 95059079af47f9a66a175f374f2da1a5020e3255)
1*38fd1498Szrj // class template regex -*- C++ -*-
2*38fd1498Szrj 
3*38fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc.
4*38fd1498Szrj //
5*38fd1498Szrj // This file is part of the GNU ISO C++ Library.  This library is free
6*38fd1498Szrj // software; you can redistribute it and/or modify it under the
7*38fd1498Szrj // terms of the GNU General Public License as published by the
8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option)
9*38fd1498Szrj // any later version.
10*38fd1498Szrj 
11*38fd1498Szrj // This library is distributed in the hope that it will be useful,
12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of
13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14*38fd1498Szrj // GNU General Public License for more details.
15*38fd1498Szrj 
16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional
17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version
18*38fd1498Szrj // 3.1, as published by the Free Software Foundation.
19*38fd1498Szrj 
20*38fd1498Szrj // You should have received a copy of the GNU General Public License and
21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program;
22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23*38fd1498Szrj // <http://www.gnu.org/licenses/>.
24*38fd1498Szrj 
25*38fd1498Szrj /**
26*38fd1498Szrj  *  @file bits/regex_automaton.h
27*38fd1498Szrj  *  This is an internal header file, included by other library headers.
28*38fd1498Szrj  *  Do not attempt to use it directly. @headername{regex}
29*38fd1498Szrj  */
30*38fd1498Szrj 
31*38fd1498Szrj // This macro defines the maximal state number a NFA can have.
32*38fd1498Szrj #ifndef _GLIBCXX_REGEX_STATE_LIMIT
33*38fd1498Szrj #define _GLIBCXX_REGEX_STATE_LIMIT 100000
34*38fd1498Szrj #endif
35*38fd1498Szrj 
_GLIBCXX_VISIBILITY(default)36*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default)
37*38fd1498Szrj {
38*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION
39*38fd1498Szrj 
40*38fd1498Szrj namespace __detail
41*38fd1498Szrj {
42*38fd1498Szrj   /**
43*38fd1498Szrj    *  @defgroup regex-detail Base and Implementation Classes
44*38fd1498Szrj    *  @ingroup regex
45*38fd1498Szrj    *  @{
46*38fd1498Szrj    */
47*38fd1498Szrj 
48*38fd1498Szrj   typedef long _StateIdT;
49*38fd1498Szrj   static const _StateIdT _S_invalid_state_id  = -1;
50*38fd1498Szrj 
51*38fd1498Szrj   template<typename _CharT>
52*38fd1498Szrj     using _Matcher = std::function<bool (_CharT)>;
53*38fd1498Szrj 
54*38fd1498Szrj   /// Operation codes that define the type of transitions within the base NFA
55*38fd1498Szrj   /// that represents the regular expression.
56*38fd1498Szrj   enum _Opcode : int
57*38fd1498Szrj   {
58*38fd1498Szrj       _S_opcode_unknown,
59*38fd1498Szrj       _S_opcode_alternative,
60*38fd1498Szrj       _S_opcode_repeat,
61*38fd1498Szrj       _S_opcode_backref,
62*38fd1498Szrj       _S_opcode_line_begin_assertion,
63*38fd1498Szrj       _S_opcode_line_end_assertion,
64*38fd1498Szrj       _S_opcode_word_boundary,
65*38fd1498Szrj       _S_opcode_subexpr_lookahead,
66*38fd1498Szrj       _S_opcode_subexpr_begin,
67*38fd1498Szrj       _S_opcode_subexpr_end,
68*38fd1498Szrj       _S_opcode_dummy,
69*38fd1498Szrj       _S_opcode_match,
70*38fd1498Szrj       _S_opcode_accept,
71*38fd1498Szrj   };
72*38fd1498Szrj 
73*38fd1498Szrj   struct _State_base
74*38fd1498Szrj   {
75*38fd1498Szrj   protected:
76*38fd1498Szrj     _Opcode      _M_opcode;           // type of outgoing transition
77*38fd1498Szrj 
78*38fd1498Szrj   public:
79*38fd1498Szrj     _StateIdT    _M_next;             // outgoing transition
80*38fd1498Szrj     union // Since they are mutually exclusive.
81*38fd1498Szrj     {
82*38fd1498Szrj       size_t _M_subexpr;        // for _S_opcode_subexpr_*
83*38fd1498Szrj       size_t _M_backref_index;  // for _S_opcode_backref
84*38fd1498Szrj       struct
85*38fd1498Szrj       {
86*38fd1498Szrj 	// for _S_opcode_alternative, _S_opcode_repeat and
87*38fd1498Szrj 	// _S_opcode_subexpr_lookahead
88*38fd1498Szrj 	_StateIdT  _M_alt;
89*38fd1498Szrj 	// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
90*38fd1498Szrj 	// quantifiers (ungreedy if set true)
91*38fd1498Szrj 	bool       _M_neg;
92*38fd1498Szrj       };
93*38fd1498Szrj       // For _S_opcode_match
94*38fd1498Szrj       __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
95*38fd1498Szrj     };
96*38fd1498Szrj 
97*38fd1498Szrj   protected:
98*38fd1498Szrj     explicit _State_base(_Opcode __opcode)
99*38fd1498Szrj     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
100*38fd1498Szrj     { }
101*38fd1498Szrj 
102*38fd1498Szrj   public:
103*38fd1498Szrj     bool
104*38fd1498Szrj     _M_has_alt()
105*38fd1498Szrj     {
106*38fd1498Szrj       return _M_opcode == _S_opcode_alternative
107*38fd1498Szrj 	|| _M_opcode == _S_opcode_repeat
108*38fd1498Szrj 	|| _M_opcode == _S_opcode_subexpr_lookahead;
109*38fd1498Szrj     }
110*38fd1498Szrj 
111*38fd1498Szrj #ifdef _GLIBCXX_DEBUG
112*38fd1498Szrj     std::ostream&
113*38fd1498Szrj     _M_print(std::ostream& ostr) const;
114*38fd1498Szrj 
115*38fd1498Szrj     // Prints graphviz dot commands for state.
116*38fd1498Szrj     std::ostream&
117*38fd1498Szrj     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
118*38fd1498Szrj #endif
119*38fd1498Szrj   };
120*38fd1498Szrj 
121*38fd1498Szrj   template<typename _Char_type>
122*38fd1498Szrj     struct _State : _State_base
123*38fd1498Szrj     {
124*38fd1498Szrj       typedef _Matcher<_Char_type> _MatcherT;
125*38fd1498Szrj       static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
126*38fd1498Szrj 		    "std::function<bool(T)> has the same size as "
127*38fd1498Szrj 		    "std::function<bool(char)>");
128*38fd1498Szrj       static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
129*38fd1498Szrj 		    "std::function<bool(T)> has the same alignment as "
130*38fd1498Szrj 		    "std::function<bool(char)>");
131*38fd1498Szrj 
132*38fd1498Szrj       explicit
133*38fd1498Szrj       _State(_Opcode __opcode) : _State_base(__opcode)
134*38fd1498Szrj       {
135*38fd1498Szrj 	if (_M_opcode() == _S_opcode_match)
136*38fd1498Szrj 	  new (this->_M_matcher_storage._M_addr()) _MatcherT();
137*38fd1498Szrj       }
138*38fd1498Szrj 
139*38fd1498Szrj       _State(const _State& __rhs) : _State_base(__rhs)
140*38fd1498Szrj       {
141*38fd1498Szrj 	if (__rhs._M_opcode() == _S_opcode_match)
142*38fd1498Szrj 	  new (this->_M_matcher_storage._M_addr())
143*38fd1498Szrj 	    _MatcherT(__rhs._M_get_matcher());
144*38fd1498Szrj       }
145*38fd1498Szrj 
146*38fd1498Szrj       _State(_State&& __rhs) : _State_base(__rhs)
147*38fd1498Szrj       {
148*38fd1498Szrj 	if (__rhs._M_opcode() == _S_opcode_match)
149*38fd1498Szrj 	  new (this->_M_matcher_storage._M_addr())
150*38fd1498Szrj 	    _MatcherT(std::move(__rhs._M_get_matcher()));
151*38fd1498Szrj       }
152*38fd1498Szrj 
153*38fd1498Szrj       _State&
154*38fd1498Szrj       operator=(const _State&) = delete;
155*38fd1498Szrj 
156*38fd1498Szrj       ~_State()
157*38fd1498Szrj       {
158*38fd1498Szrj 	if (_M_opcode() == _S_opcode_match)
159*38fd1498Szrj 	  _M_get_matcher().~_MatcherT();
160*38fd1498Szrj       }
161*38fd1498Szrj 
162*38fd1498Szrj       // Since correct ctor and dtor rely on _M_opcode, it's better not to
163*38fd1498Szrj       // change it over time.
164*38fd1498Szrj       _Opcode
165*38fd1498Szrj       _M_opcode() const
166*38fd1498Szrj       { return _State_base::_M_opcode; }
167*38fd1498Szrj 
168*38fd1498Szrj       bool
169*38fd1498Szrj       _M_matches(_Char_type __char) const
170*38fd1498Szrj       { return _M_get_matcher()(__char); }
171*38fd1498Szrj 
172*38fd1498Szrj       _MatcherT&
173*38fd1498Szrj       _M_get_matcher()
174*38fd1498Szrj       { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
175*38fd1498Szrj 
176*38fd1498Szrj       const _MatcherT&
177*38fd1498Szrj       _M_get_matcher() const
178*38fd1498Szrj       {
179*38fd1498Szrj 	return *static_cast<const _MatcherT*>(
180*38fd1498Szrj 	    this->_M_matcher_storage._M_addr());
181*38fd1498Szrj       }
182*38fd1498Szrj     };
183*38fd1498Szrj 
184*38fd1498Szrj   struct _NFA_base
185*38fd1498Szrj   {
186*38fd1498Szrj     typedef size_t                              _SizeT;
187*38fd1498Szrj     typedef regex_constants::syntax_option_type _FlagT;
188*38fd1498Szrj 
189*38fd1498Szrj     explicit
190*38fd1498Szrj     _NFA_base(_FlagT __f)
191*38fd1498Szrj     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
192*38fd1498Szrj     _M_has_backref(false)
193*38fd1498Szrj     { }
194*38fd1498Szrj 
195*38fd1498Szrj     _NFA_base(_NFA_base&&) = default;
196*38fd1498Szrj 
197*38fd1498Szrj   protected:
198*38fd1498Szrj     ~_NFA_base() = default;
199*38fd1498Szrj 
200*38fd1498Szrj   public:
201*38fd1498Szrj     _FlagT
202*38fd1498Szrj     _M_options() const
203*38fd1498Szrj     { return _M_flags; }
204*38fd1498Szrj 
205*38fd1498Szrj     _StateIdT
206*38fd1498Szrj     _M_start() const
207*38fd1498Szrj     { return _M_start_state; }
208*38fd1498Szrj 
209*38fd1498Szrj     _SizeT
210*38fd1498Szrj     _M_sub_count() const
211*38fd1498Szrj     { return _M_subexpr_count; }
212*38fd1498Szrj 
213*38fd1498Szrj     std::vector<size_t>       _M_paren_stack;
214*38fd1498Szrj     _FlagT                    _M_flags;
215*38fd1498Szrj     _StateIdT                 _M_start_state;
216*38fd1498Szrj     _SizeT                    _M_subexpr_count;
217*38fd1498Szrj     bool                      _M_has_backref;
218*38fd1498Szrj   };
219*38fd1498Szrj 
220*38fd1498Szrj   template<typename _TraitsT>
221*38fd1498Szrj     struct _NFA
222*38fd1498Szrj     : _NFA_base, std::vector<_State<typename _TraitsT::char_type>>
223*38fd1498Szrj     {
224*38fd1498Szrj       typedef typename _TraitsT::char_type	_Char_type;
225*38fd1498Szrj       typedef _State<_Char_type>		_StateT;
226*38fd1498Szrj       typedef _Matcher<_Char_type>		_MatcherT;
227*38fd1498Szrj 
228*38fd1498Szrj       _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
229*38fd1498Szrj       : _NFA_base(__flags)
230*38fd1498Szrj       { _M_traits.imbue(__loc); }
231*38fd1498Szrj 
232*38fd1498Szrj       // for performance reasons _NFA objects should only be moved not copied
233*38fd1498Szrj       _NFA(const _NFA&) = delete;
234*38fd1498Szrj       _NFA(_NFA&&) = default;
235*38fd1498Szrj 
236*38fd1498Szrj       _StateIdT
237*38fd1498Szrj       _M_insert_accept()
238*38fd1498Szrj       {
239*38fd1498Szrj 	auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
240*38fd1498Szrj 	return __ret;
241*38fd1498Szrj       }
242*38fd1498Szrj 
243*38fd1498Szrj       _StateIdT
244*38fd1498Szrj       _M_insert_alt(_StateIdT __next, _StateIdT __alt,
245*38fd1498Szrj 		    bool __neg __attribute__((__unused__)))
246*38fd1498Szrj       {
247*38fd1498Szrj 	_StateT __tmp(_S_opcode_alternative);
248*38fd1498Szrj 	// It labels every quantifier to make greedy comparison easier in BFS
249*38fd1498Szrj 	// approach.
250*38fd1498Szrj 	__tmp._M_next = __next;
251*38fd1498Szrj 	__tmp._M_alt = __alt;
252*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
253*38fd1498Szrj       }
254*38fd1498Szrj 
255*38fd1498Szrj       _StateIdT
256*38fd1498Szrj       _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
257*38fd1498Szrj       {
258*38fd1498Szrj 	_StateT __tmp(_S_opcode_repeat);
259*38fd1498Szrj 	// It labels every quantifier to make greedy comparison easier in BFS
260*38fd1498Szrj 	// approach.
261*38fd1498Szrj 	__tmp._M_next = __next;
262*38fd1498Szrj 	__tmp._M_alt = __alt;
263*38fd1498Szrj 	__tmp._M_neg = __neg;
264*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
265*38fd1498Szrj       }
266*38fd1498Szrj 
267*38fd1498Szrj       _StateIdT
268*38fd1498Szrj       _M_insert_matcher(_MatcherT __m)
269*38fd1498Szrj       {
270*38fd1498Szrj 	_StateT __tmp(_S_opcode_match);
271*38fd1498Szrj 	__tmp._M_get_matcher() = std::move(__m);
272*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
273*38fd1498Szrj       }
274*38fd1498Szrj 
275*38fd1498Szrj       _StateIdT
276*38fd1498Szrj       _M_insert_subexpr_begin()
277*38fd1498Szrj       {
278*38fd1498Szrj 	auto __id = this->_M_subexpr_count++;
279*38fd1498Szrj 	this->_M_paren_stack.push_back(__id);
280*38fd1498Szrj 	_StateT __tmp(_S_opcode_subexpr_begin);
281*38fd1498Szrj 	__tmp._M_subexpr = __id;
282*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
283*38fd1498Szrj       }
284*38fd1498Szrj 
285*38fd1498Szrj       _StateIdT
286*38fd1498Szrj       _M_insert_subexpr_end()
287*38fd1498Szrj       {
288*38fd1498Szrj 	_StateT __tmp(_S_opcode_subexpr_end);
289*38fd1498Szrj 	__tmp._M_subexpr = this->_M_paren_stack.back();
290*38fd1498Szrj 	this->_M_paren_stack.pop_back();
291*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
292*38fd1498Szrj       }
293*38fd1498Szrj 
294*38fd1498Szrj       _StateIdT
295*38fd1498Szrj       _M_insert_backref(size_t __index);
296*38fd1498Szrj 
297*38fd1498Szrj       _StateIdT
298*38fd1498Szrj       _M_insert_line_begin()
299*38fd1498Szrj       { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
300*38fd1498Szrj 
301*38fd1498Szrj       _StateIdT
302*38fd1498Szrj       _M_insert_line_end()
303*38fd1498Szrj       { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
304*38fd1498Szrj 
305*38fd1498Szrj       _StateIdT
306*38fd1498Szrj       _M_insert_word_bound(bool __neg)
307*38fd1498Szrj       {
308*38fd1498Szrj 	_StateT __tmp(_S_opcode_word_boundary);
309*38fd1498Szrj 	__tmp._M_neg = __neg;
310*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
311*38fd1498Szrj       }
312*38fd1498Szrj 
313*38fd1498Szrj       _StateIdT
314*38fd1498Szrj       _M_insert_lookahead(_StateIdT __alt, bool __neg)
315*38fd1498Szrj       {
316*38fd1498Szrj 	_StateT __tmp(_S_opcode_subexpr_lookahead);
317*38fd1498Szrj 	__tmp._M_alt = __alt;
318*38fd1498Szrj 	__tmp._M_neg = __neg;
319*38fd1498Szrj 	return _M_insert_state(std::move(__tmp));
320*38fd1498Szrj       }
321*38fd1498Szrj 
322*38fd1498Szrj       _StateIdT
323*38fd1498Szrj       _M_insert_dummy()
324*38fd1498Szrj       { return _M_insert_state(_StateT(_S_opcode_dummy)); }
325*38fd1498Szrj 
326*38fd1498Szrj       _StateIdT
327*38fd1498Szrj       _M_insert_state(_StateT __s)
328*38fd1498Szrj       {
329*38fd1498Szrj 	this->push_back(std::move(__s));
330*38fd1498Szrj 	if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
331*38fd1498Szrj 	  __throw_regex_error(
332*38fd1498Szrj 	    regex_constants::error_space,
333*38fd1498Szrj 	    "Number of NFA states exceeds limit. Please use shorter regex "
334*38fd1498Szrj 	    "string, or use smaller brace expression, or make "
335*38fd1498Szrj 	    "_GLIBCXX_REGEX_STATE_LIMIT larger.");
336*38fd1498Szrj 	return this->size() - 1;
337*38fd1498Szrj       }
338*38fd1498Szrj 
339*38fd1498Szrj       // Eliminate dummy node in this NFA to make it compact.
340*38fd1498Szrj       void
341*38fd1498Szrj       _M_eliminate_dummy();
342*38fd1498Szrj 
343*38fd1498Szrj #ifdef _GLIBCXX_DEBUG
344*38fd1498Szrj       std::ostream&
345*38fd1498Szrj       _M_dot(std::ostream& __ostr) const;
346*38fd1498Szrj #endif
347*38fd1498Szrj     public:
348*38fd1498Szrj       _TraitsT                  _M_traits;
349*38fd1498Szrj     };
350*38fd1498Szrj 
351*38fd1498Szrj   /// Describes a sequence of one or more %_State, its current start
352*38fd1498Szrj   /// and end(s).  This structure contains fragments of an NFA during
353*38fd1498Szrj   /// construction.
354*38fd1498Szrj   template<typename _TraitsT>
355*38fd1498Szrj     class _StateSeq
356*38fd1498Szrj     {
357*38fd1498Szrj     public:
358*38fd1498Szrj       typedef _NFA<_TraitsT> _RegexT;
359*38fd1498Szrj 
360*38fd1498Szrj     public:
361*38fd1498Szrj       _StateSeq(_RegexT& __nfa, _StateIdT __s)
362*38fd1498Szrj       : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
363*38fd1498Szrj       { }
364*38fd1498Szrj 
365*38fd1498Szrj       _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
366*38fd1498Szrj       : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
367*38fd1498Szrj       { }
368*38fd1498Szrj 
369*38fd1498Szrj       // Append a state on *this and change *this to the new sequence.
370*38fd1498Szrj       void
371*38fd1498Szrj       _M_append(_StateIdT __id)
372*38fd1498Szrj       {
373*38fd1498Szrj 	_M_nfa[_M_end]._M_next = __id;
374*38fd1498Szrj 	_M_end = __id;
375*38fd1498Szrj       }
376*38fd1498Szrj 
377*38fd1498Szrj       // Append a sequence on *this and change *this to the new sequence.
378*38fd1498Szrj       void
379*38fd1498Szrj       _M_append(const _StateSeq& __s)
380*38fd1498Szrj       {
381*38fd1498Szrj 	_M_nfa[_M_end]._M_next = __s._M_start;
382*38fd1498Szrj 	_M_end = __s._M_end;
383*38fd1498Szrj       }
384*38fd1498Szrj 
385*38fd1498Szrj       // Clones an entire sequence.
386*38fd1498Szrj       _StateSeq
387*38fd1498Szrj       _M_clone();
388*38fd1498Szrj 
389*38fd1498Szrj     public:
390*38fd1498Szrj       _RegexT&  _M_nfa;
391*38fd1498Szrj       _StateIdT _M_start;
392*38fd1498Szrj       _StateIdT _M_end;
393*38fd1498Szrj     };
394*38fd1498Szrj 
395*38fd1498Szrj  //@} regex-detail
396*38fd1498Szrj } // namespace __detail
397*38fd1498Szrj 
398*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION
399*38fd1498Szrj } // namespace std
400*38fd1498Szrj 
401*38fd1498Szrj #include <bits/regex_automaton.tcc>
402