1*38fd1498Szrj // class template regex -*- C++ -*-
2*38fd1498Szrj
3*38fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc.
4*38fd1498Szrj //
5*38fd1498Szrj // This file is part of the GNU ISO C++ Library. This library is free
6*38fd1498Szrj // software; you can redistribute it and/or modify it under the
7*38fd1498Szrj // terms of the GNU General Public License as published by the
8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option)
9*38fd1498Szrj // any later version.
10*38fd1498Szrj
11*38fd1498Szrj // This library is distributed in the hope that it will be useful,
12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of
13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14*38fd1498Szrj // GNU General Public License for more details.
15*38fd1498Szrj
16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional
17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version
18*38fd1498Szrj // 3.1, as published by the Free Software Foundation.
19*38fd1498Szrj
20*38fd1498Szrj // You should have received a copy of the GNU General Public License and
21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program;
22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23*38fd1498Szrj // <http://www.gnu.org/licenses/>.
24*38fd1498Szrj
25*38fd1498Szrj /**
26*38fd1498Szrj * @file bits/regex_automaton.h
27*38fd1498Szrj * This is an internal header file, included by other library headers.
28*38fd1498Szrj * Do not attempt to use it directly. @headername{regex}
29*38fd1498Szrj */
30*38fd1498Szrj
31*38fd1498Szrj // This macro defines the maximal state number a NFA can have.
32*38fd1498Szrj #ifndef _GLIBCXX_REGEX_STATE_LIMIT
33*38fd1498Szrj #define _GLIBCXX_REGEX_STATE_LIMIT 100000
34*38fd1498Szrj #endif
35*38fd1498Szrj
_GLIBCXX_VISIBILITY(default)36*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default)
37*38fd1498Szrj {
38*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION
39*38fd1498Szrj
40*38fd1498Szrj namespace __detail
41*38fd1498Szrj {
42*38fd1498Szrj /**
43*38fd1498Szrj * @defgroup regex-detail Base and Implementation Classes
44*38fd1498Szrj * @ingroup regex
45*38fd1498Szrj * @{
46*38fd1498Szrj */
47*38fd1498Szrj
48*38fd1498Szrj typedef long _StateIdT;
49*38fd1498Szrj static const _StateIdT _S_invalid_state_id = -1;
50*38fd1498Szrj
51*38fd1498Szrj template<typename _CharT>
52*38fd1498Szrj using _Matcher = std::function<bool (_CharT)>;
53*38fd1498Szrj
54*38fd1498Szrj /// Operation codes that define the type of transitions within the base NFA
55*38fd1498Szrj /// that represents the regular expression.
56*38fd1498Szrj enum _Opcode : int
57*38fd1498Szrj {
58*38fd1498Szrj _S_opcode_unknown,
59*38fd1498Szrj _S_opcode_alternative,
60*38fd1498Szrj _S_opcode_repeat,
61*38fd1498Szrj _S_opcode_backref,
62*38fd1498Szrj _S_opcode_line_begin_assertion,
63*38fd1498Szrj _S_opcode_line_end_assertion,
64*38fd1498Szrj _S_opcode_word_boundary,
65*38fd1498Szrj _S_opcode_subexpr_lookahead,
66*38fd1498Szrj _S_opcode_subexpr_begin,
67*38fd1498Szrj _S_opcode_subexpr_end,
68*38fd1498Szrj _S_opcode_dummy,
69*38fd1498Szrj _S_opcode_match,
70*38fd1498Szrj _S_opcode_accept,
71*38fd1498Szrj };
72*38fd1498Szrj
73*38fd1498Szrj struct _State_base
74*38fd1498Szrj {
75*38fd1498Szrj protected:
76*38fd1498Szrj _Opcode _M_opcode; // type of outgoing transition
77*38fd1498Szrj
78*38fd1498Szrj public:
79*38fd1498Szrj _StateIdT _M_next; // outgoing transition
80*38fd1498Szrj union // Since they are mutually exclusive.
81*38fd1498Szrj {
82*38fd1498Szrj size_t _M_subexpr; // for _S_opcode_subexpr_*
83*38fd1498Szrj size_t _M_backref_index; // for _S_opcode_backref
84*38fd1498Szrj struct
85*38fd1498Szrj {
86*38fd1498Szrj // for _S_opcode_alternative, _S_opcode_repeat and
87*38fd1498Szrj // _S_opcode_subexpr_lookahead
88*38fd1498Szrj _StateIdT _M_alt;
89*38fd1498Szrj // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
90*38fd1498Szrj // quantifiers (ungreedy if set true)
91*38fd1498Szrj bool _M_neg;
92*38fd1498Szrj };
93*38fd1498Szrj // For _S_opcode_match
94*38fd1498Szrj __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
95*38fd1498Szrj };
96*38fd1498Szrj
97*38fd1498Szrj protected:
98*38fd1498Szrj explicit _State_base(_Opcode __opcode)
99*38fd1498Szrj : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
100*38fd1498Szrj { }
101*38fd1498Szrj
102*38fd1498Szrj public:
103*38fd1498Szrj bool
104*38fd1498Szrj _M_has_alt()
105*38fd1498Szrj {
106*38fd1498Szrj return _M_opcode == _S_opcode_alternative
107*38fd1498Szrj || _M_opcode == _S_opcode_repeat
108*38fd1498Szrj || _M_opcode == _S_opcode_subexpr_lookahead;
109*38fd1498Szrj }
110*38fd1498Szrj
111*38fd1498Szrj #ifdef _GLIBCXX_DEBUG
112*38fd1498Szrj std::ostream&
113*38fd1498Szrj _M_print(std::ostream& ostr) const;
114*38fd1498Szrj
115*38fd1498Szrj // Prints graphviz dot commands for state.
116*38fd1498Szrj std::ostream&
117*38fd1498Szrj _M_dot(std::ostream& __ostr, _StateIdT __id) const;
118*38fd1498Szrj #endif
119*38fd1498Szrj };
120*38fd1498Szrj
121*38fd1498Szrj template<typename _Char_type>
122*38fd1498Szrj struct _State : _State_base
123*38fd1498Szrj {
124*38fd1498Szrj typedef _Matcher<_Char_type> _MatcherT;
125*38fd1498Szrj static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
126*38fd1498Szrj "std::function<bool(T)> has the same size as "
127*38fd1498Szrj "std::function<bool(char)>");
128*38fd1498Szrj static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
129*38fd1498Szrj "std::function<bool(T)> has the same alignment as "
130*38fd1498Szrj "std::function<bool(char)>");
131*38fd1498Szrj
132*38fd1498Szrj explicit
133*38fd1498Szrj _State(_Opcode __opcode) : _State_base(__opcode)
134*38fd1498Szrj {
135*38fd1498Szrj if (_M_opcode() == _S_opcode_match)
136*38fd1498Szrj new (this->_M_matcher_storage._M_addr()) _MatcherT();
137*38fd1498Szrj }
138*38fd1498Szrj
139*38fd1498Szrj _State(const _State& __rhs) : _State_base(__rhs)
140*38fd1498Szrj {
141*38fd1498Szrj if (__rhs._M_opcode() == _S_opcode_match)
142*38fd1498Szrj new (this->_M_matcher_storage._M_addr())
143*38fd1498Szrj _MatcherT(__rhs._M_get_matcher());
144*38fd1498Szrj }
145*38fd1498Szrj
146*38fd1498Szrj _State(_State&& __rhs) : _State_base(__rhs)
147*38fd1498Szrj {
148*38fd1498Szrj if (__rhs._M_opcode() == _S_opcode_match)
149*38fd1498Szrj new (this->_M_matcher_storage._M_addr())
150*38fd1498Szrj _MatcherT(std::move(__rhs._M_get_matcher()));
151*38fd1498Szrj }
152*38fd1498Szrj
153*38fd1498Szrj _State&
154*38fd1498Szrj operator=(const _State&) = delete;
155*38fd1498Szrj
156*38fd1498Szrj ~_State()
157*38fd1498Szrj {
158*38fd1498Szrj if (_M_opcode() == _S_opcode_match)
159*38fd1498Szrj _M_get_matcher().~_MatcherT();
160*38fd1498Szrj }
161*38fd1498Szrj
162*38fd1498Szrj // Since correct ctor and dtor rely on _M_opcode, it's better not to
163*38fd1498Szrj // change it over time.
164*38fd1498Szrj _Opcode
165*38fd1498Szrj _M_opcode() const
166*38fd1498Szrj { return _State_base::_M_opcode; }
167*38fd1498Szrj
168*38fd1498Szrj bool
169*38fd1498Szrj _M_matches(_Char_type __char) const
170*38fd1498Szrj { return _M_get_matcher()(__char); }
171*38fd1498Szrj
172*38fd1498Szrj _MatcherT&
173*38fd1498Szrj _M_get_matcher()
174*38fd1498Szrj { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
175*38fd1498Szrj
176*38fd1498Szrj const _MatcherT&
177*38fd1498Szrj _M_get_matcher() const
178*38fd1498Szrj {
179*38fd1498Szrj return *static_cast<const _MatcherT*>(
180*38fd1498Szrj this->_M_matcher_storage._M_addr());
181*38fd1498Szrj }
182*38fd1498Szrj };
183*38fd1498Szrj
184*38fd1498Szrj struct _NFA_base
185*38fd1498Szrj {
186*38fd1498Szrj typedef size_t _SizeT;
187*38fd1498Szrj typedef regex_constants::syntax_option_type _FlagT;
188*38fd1498Szrj
189*38fd1498Szrj explicit
190*38fd1498Szrj _NFA_base(_FlagT __f)
191*38fd1498Szrj : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
192*38fd1498Szrj _M_has_backref(false)
193*38fd1498Szrj { }
194*38fd1498Szrj
195*38fd1498Szrj _NFA_base(_NFA_base&&) = default;
196*38fd1498Szrj
197*38fd1498Szrj protected:
198*38fd1498Szrj ~_NFA_base() = default;
199*38fd1498Szrj
200*38fd1498Szrj public:
201*38fd1498Szrj _FlagT
202*38fd1498Szrj _M_options() const
203*38fd1498Szrj { return _M_flags; }
204*38fd1498Szrj
205*38fd1498Szrj _StateIdT
206*38fd1498Szrj _M_start() const
207*38fd1498Szrj { return _M_start_state; }
208*38fd1498Szrj
209*38fd1498Szrj _SizeT
210*38fd1498Szrj _M_sub_count() const
211*38fd1498Szrj { return _M_subexpr_count; }
212*38fd1498Szrj
213*38fd1498Szrj std::vector<size_t> _M_paren_stack;
214*38fd1498Szrj _FlagT _M_flags;
215*38fd1498Szrj _StateIdT _M_start_state;
216*38fd1498Szrj _SizeT _M_subexpr_count;
217*38fd1498Szrj bool _M_has_backref;
218*38fd1498Szrj };
219*38fd1498Szrj
220*38fd1498Szrj template<typename _TraitsT>
221*38fd1498Szrj struct _NFA
222*38fd1498Szrj : _NFA_base, std::vector<_State<typename _TraitsT::char_type>>
223*38fd1498Szrj {
224*38fd1498Szrj typedef typename _TraitsT::char_type _Char_type;
225*38fd1498Szrj typedef _State<_Char_type> _StateT;
226*38fd1498Szrj typedef _Matcher<_Char_type> _MatcherT;
227*38fd1498Szrj
228*38fd1498Szrj _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
229*38fd1498Szrj : _NFA_base(__flags)
230*38fd1498Szrj { _M_traits.imbue(__loc); }
231*38fd1498Szrj
232*38fd1498Szrj // for performance reasons _NFA objects should only be moved not copied
233*38fd1498Szrj _NFA(const _NFA&) = delete;
234*38fd1498Szrj _NFA(_NFA&&) = default;
235*38fd1498Szrj
236*38fd1498Szrj _StateIdT
237*38fd1498Szrj _M_insert_accept()
238*38fd1498Szrj {
239*38fd1498Szrj auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
240*38fd1498Szrj return __ret;
241*38fd1498Szrj }
242*38fd1498Szrj
243*38fd1498Szrj _StateIdT
244*38fd1498Szrj _M_insert_alt(_StateIdT __next, _StateIdT __alt,
245*38fd1498Szrj bool __neg __attribute__((__unused__)))
246*38fd1498Szrj {
247*38fd1498Szrj _StateT __tmp(_S_opcode_alternative);
248*38fd1498Szrj // It labels every quantifier to make greedy comparison easier in BFS
249*38fd1498Szrj // approach.
250*38fd1498Szrj __tmp._M_next = __next;
251*38fd1498Szrj __tmp._M_alt = __alt;
252*38fd1498Szrj return _M_insert_state(std::move(__tmp));
253*38fd1498Szrj }
254*38fd1498Szrj
255*38fd1498Szrj _StateIdT
256*38fd1498Szrj _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
257*38fd1498Szrj {
258*38fd1498Szrj _StateT __tmp(_S_opcode_repeat);
259*38fd1498Szrj // It labels every quantifier to make greedy comparison easier in BFS
260*38fd1498Szrj // approach.
261*38fd1498Szrj __tmp._M_next = __next;
262*38fd1498Szrj __tmp._M_alt = __alt;
263*38fd1498Szrj __tmp._M_neg = __neg;
264*38fd1498Szrj return _M_insert_state(std::move(__tmp));
265*38fd1498Szrj }
266*38fd1498Szrj
267*38fd1498Szrj _StateIdT
268*38fd1498Szrj _M_insert_matcher(_MatcherT __m)
269*38fd1498Szrj {
270*38fd1498Szrj _StateT __tmp(_S_opcode_match);
271*38fd1498Szrj __tmp._M_get_matcher() = std::move(__m);
272*38fd1498Szrj return _M_insert_state(std::move(__tmp));
273*38fd1498Szrj }
274*38fd1498Szrj
275*38fd1498Szrj _StateIdT
276*38fd1498Szrj _M_insert_subexpr_begin()
277*38fd1498Szrj {
278*38fd1498Szrj auto __id = this->_M_subexpr_count++;
279*38fd1498Szrj this->_M_paren_stack.push_back(__id);
280*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_begin);
281*38fd1498Szrj __tmp._M_subexpr = __id;
282*38fd1498Szrj return _M_insert_state(std::move(__tmp));
283*38fd1498Szrj }
284*38fd1498Szrj
285*38fd1498Szrj _StateIdT
286*38fd1498Szrj _M_insert_subexpr_end()
287*38fd1498Szrj {
288*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_end);
289*38fd1498Szrj __tmp._M_subexpr = this->_M_paren_stack.back();
290*38fd1498Szrj this->_M_paren_stack.pop_back();
291*38fd1498Szrj return _M_insert_state(std::move(__tmp));
292*38fd1498Szrj }
293*38fd1498Szrj
294*38fd1498Szrj _StateIdT
295*38fd1498Szrj _M_insert_backref(size_t __index);
296*38fd1498Szrj
297*38fd1498Szrj _StateIdT
298*38fd1498Szrj _M_insert_line_begin()
299*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
300*38fd1498Szrj
301*38fd1498Szrj _StateIdT
302*38fd1498Szrj _M_insert_line_end()
303*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
304*38fd1498Szrj
305*38fd1498Szrj _StateIdT
306*38fd1498Szrj _M_insert_word_bound(bool __neg)
307*38fd1498Szrj {
308*38fd1498Szrj _StateT __tmp(_S_opcode_word_boundary);
309*38fd1498Szrj __tmp._M_neg = __neg;
310*38fd1498Szrj return _M_insert_state(std::move(__tmp));
311*38fd1498Szrj }
312*38fd1498Szrj
313*38fd1498Szrj _StateIdT
314*38fd1498Szrj _M_insert_lookahead(_StateIdT __alt, bool __neg)
315*38fd1498Szrj {
316*38fd1498Szrj _StateT __tmp(_S_opcode_subexpr_lookahead);
317*38fd1498Szrj __tmp._M_alt = __alt;
318*38fd1498Szrj __tmp._M_neg = __neg;
319*38fd1498Szrj return _M_insert_state(std::move(__tmp));
320*38fd1498Szrj }
321*38fd1498Szrj
322*38fd1498Szrj _StateIdT
323*38fd1498Szrj _M_insert_dummy()
324*38fd1498Szrj { return _M_insert_state(_StateT(_S_opcode_dummy)); }
325*38fd1498Szrj
326*38fd1498Szrj _StateIdT
327*38fd1498Szrj _M_insert_state(_StateT __s)
328*38fd1498Szrj {
329*38fd1498Szrj this->push_back(std::move(__s));
330*38fd1498Szrj if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
331*38fd1498Szrj __throw_regex_error(
332*38fd1498Szrj regex_constants::error_space,
333*38fd1498Szrj "Number of NFA states exceeds limit. Please use shorter regex "
334*38fd1498Szrj "string, or use smaller brace expression, or make "
335*38fd1498Szrj "_GLIBCXX_REGEX_STATE_LIMIT larger.");
336*38fd1498Szrj return this->size() - 1;
337*38fd1498Szrj }
338*38fd1498Szrj
339*38fd1498Szrj // Eliminate dummy node in this NFA to make it compact.
340*38fd1498Szrj void
341*38fd1498Szrj _M_eliminate_dummy();
342*38fd1498Szrj
343*38fd1498Szrj #ifdef _GLIBCXX_DEBUG
344*38fd1498Szrj std::ostream&
345*38fd1498Szrj _M_dot(std::ostream& __ostr) const;
346*38fd1498Szrj #endif
347*38fd1498Szrj public:
348*38fd1498Szrj _TraitsT _M_traits;
349*38fd1498Szrj };
350*38fd1498Szrj
351*38fd1498Szrj /// Describes a sequence of one or more %_State, its current start
352*38fd1498Szrj /// and end(s). This structure contains fragments of an NFA during
353*38fd1498Szrj /// construction.
354*38fd1498Szrj template<typename _TraitsT>
355*38fd1498Szrj class _StateSeq
356*38fd1498Szrj {
357*38fd1498Szrj public:
358*38fd1498Szrj typedef _NFA<_TraitsT> _RegexT;
359*38fd1498Szrj
360*38fd1498Szrj public:
361*38fd1498Szrj _StateSeq(_RegexT& __nfa, _StateIdT __s)
362*38fd1498Szrj : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
363*38fd1498Szrj { }
364*38fd1498Szrj
365*38fd1498Szrj _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
366*38fd1498Szrj : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
367*38fd1498Szrj { }
368*38fd1498Szrj
369*38fd1498Szrj // Append a state on *this and change *this to the new sequence.
370*38fd1498Szrj void
371*38fd1498Szrj _M_append(_StateIdT __id)
372*38fd1498Szrj {
373*38fd1498Szrj _M_nfa[_M_end]._M_next = __id;
374*38fd1498Szrj _M_end = __id;
375*38fd1498Szrj }
376*38fd1498Szrj
377*38fd1498Szrj // Append a sequence on *this and change *this to the new sequence.
378*38fd1498Szrj void
379*38fd1498Szrj _M_append(const _StateSeq& __s)
380*38fd1498Szrj {
381*38fd1498Szrj _M_nfa[_M_end]._M_next = __s._M_start;
382*38fd1498Szrj _M_end = __s._M_end;
383*38fd1498Szrj }
384*38fd1498Szrj
385*38fd1498Szrj // Clones an entire sequence.
386*38fd1498Szrj _StateSeq
387*38fd1498Szrj _M_clone();
388*38fd1498Szrj
389*38fd1498Szrj public:
390*38fd1498Szrj _RegexT& _M_nfa;
391*38fd1498Szrj _StateIdT _M_start;
392*38fd1498Szrj _StateIdT _M_end;
393*38fd1498Szrj };
394*38fd1498Szrj
395*38fd1498Szrj //@} regex-detail
396*38fd1498Szrj } // namespace __detail
397*38fd1498Szrj
398*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION
399*38fd1498Szrj } // namespace std
400*38fd1498Szrj
401*38fd1498Szrj #include <bits/regex_automaton.tcc>
402