xref: /netbsd-src/external/gpl3/gcc/dist/libstdc++-v3/include/bits/regex_executor.h (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2022 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex_executor.h
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME convert comments to doxygen format.
32 
_GLIBCXX_VISIBILITY(default)33 namespace std _GLIBCXX_VISIBILITY(default)
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 
37 namespace __detail
38 {
39   /**
40    * @addtogroup regex-detail
41    * @{
42    */
43 
44   /**
45    * @brief Takes a regex and an input string and does the matching.
46    *
47    * The %_Executor class has two modes: DFS mode and BFS mode, controlled
48    * by the template parameter %__dfs_mode.
49    */
50   template<typename _BiIter, typename _Alloc, typename _TraitsT,
51 	   bool __dfs_mode>
52     class _Executor
53     {
54       using __search_mode = integral_constant<bool, __dfs_mode>;
55       using __dfs = true_type;
56       using __bfs = false_type;
57 
58       enum class _Match_mode : unsigned char { _Exact, _Prefix };
59 
60     public:
61       typedef typename iterator_traits<_BiIter>::value_type _CharT;
62       typedef basic_regex<_CharT, _TraitsT>                 _RegexT;
63       typedef _GLIBCXX_STD_C::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
64       typedef regex_constants::match_flag_type              _FlagT;
65       typedef typename _TraitsT::char_class_type            _ClassT;
66       typedef _NFA<_TraitsT>                                _NFAT;
67 
68     public:
69       _Executor(_BiIter         __begin,
70 		_BiIter         __end,
71 		_ResultsVec&    __results,
72 		const _RegexT&  __re,
73 		_FlagT          __flags)
74       : _M_cur_results(__results.get_allocator()),
75 	_M_begin(__begin),
76 	_M_end(__end),
77 	_M_re(__re),
78 	_M_nfa(*__re._M_automaton),
79 	_M_results(__results),
80 	_M_rep_count(_M_nfa.size()),
81 	_M_states(_M_nfa._M_start(), _M_nfa.size()),
82 	_M_flags(__flags)
83       {
84 	using namespace regex_constants;
85 	if (__flags & match_prev_avail) // ignore not_bol and not_bow
86 	  _M_flags &= ~(match_not_bol | match_not_bow);
87       }
88 
89       // Set matched when string exactly matches the pattern.
90       bool
91       _M_match()
92       {
93 	_M_current = _M_begin;
94 	return _M_main(_Match_mode::_Exact);
95       }
96 
97       // Set matched when some prefix of the string matches the pattern.
98       bool
99       _M_search_from_first()
100       {
101 	_M_current = _M_begin;
102 	return _M_main(_Match_mode::_Prefix);
103       }
104 
105       bool
106       _M_search();
107 
108     private:
109       void
110       _M_rep_once_more(_Match_mode __match_mode, _StateIdT);
111 
112       void
113       _M_handle_repeat(_Match_mode, _StateIdT);
114 
115       void
116       _M_handle_subexpr_begin(_Match_mode, _StateIdT);
117 
118       void
119       _M_handle_subexpr_end(_Match_mode, _StateIdT);
120 
121       void
122       _M_handle_line_begin_assertion(_Match_mode, _StateIdT);
123 
124       void
125       _M_handle_line_end_assertion(_Match_mode, _StateIdT);
126 
127       void
128       _M_handle_word_boundary(_Match_mode, _StateIdT);
129 
130       void
131       _M_handle_subexpr_lookahead(_Match_mode, _StateIdT);
132 
133       void
134       _M_handle_match(_Match_mode, _StateIdT);
135 
136       void
137       _M_handle_backref(_Match_mode, _StateIdT);
138 
139       void
140       _M_handle_accept(_Match_mode, _StateIdT);
141 
142       void
143       _M_handle_alternative(_Match_mode, _StateIdT);
144 
145       void
146       _M_dfs(_Match_mode __match_mode, _StateIdT __start);
147 
148       bool
149       _M_main(_Match_mode __match_mode)
150       { return _M_main_dispatch(__match_mode, __search_mode{}); }
151 
152       bool
153       _M_main_dispatch(_Match_mode __match_mode, __dfs);
154 
155       bool
156       _M_main_dispatch(_Match_mode __match_mode, __bfs);
157 
158       bool
159       _M_is_word(_CharT __ch) const
160       {
161 	static const _CharT __s[2] = { 'w' };
162 	return _M_re._M_automaton->_M_traits.isctype
163 	  (__ch, _M_re._M_automaton->_M_traits.lookup_classname(__s, __s+1));
164       }
165 
166       bool
167       _M_at_begin() const
168       {
169 	if (_M_current == _M_begin)
170 	  {
171 	    // match_not_bol means ^ does not match [_M_begin,_M_begin)
172 	    if (_M_flags & regex_constants::match_not_bol)
173 	      return false;
174 	    // match_prev_avail means _M_begin is not the start of the input.
175 	    if (_M_flags & regex_constants::match_prev_avail)
176 	      {
177 		// For ECMAScript multiline matches, check if the previous
178 		// character is a line terminator.
179 		if (_M_match_multiline())
180 		  return _M_is_line_terminator(*std::prev(_M_current));
181 		else
182 		  return false;
183 	      }
184 	    else // ^ matches at _M_begin
185 	      return true;
186 	  }
187 	else if (_M_match_multiline())
188 	  return _M_is_line_terminator(*std::prev(_M_current));
189 	else
190 	  return false;
191       }
192 
193       bool
194       _M_at_end() const
195       {
196 	if (_M_current == _M_end)
197 	  return !(_M_flags & regex_constants::match_not_eol);
198 	else if (_M_match_multiline())
199 	  return _M_is_line_terminator(*_M_current);
200 	else
201 	  return false;
202       }
203 
204       bool
205       _M_word_boundary() const;
206 
207       bool
208       _M_lookahead(_StateIdT __next);
209 
210       bool
211       _M_is_line_terminator(_CharT __c) const
212       {
213 	const auto& __traits = _M_re._M_automaton->_M_traits;
214 	const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc());
215 	const char __n{ __ct.narrow(__c, ' ') };
216 	if (__n == '\n')
217 	  return true;
218 	if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript)
219 	  {
220 	    if (__n == '\r')
221 	      return true;
222 	    // FIXME: U+2028 (line separator) and U+2029 (paragraph separator)
223 	  }
224 	return false;
225       }
226 
227       bool
228       _M_match_multiline() const noexcept
229       {
230 	constexpr auto __m
231 	  = regex_constants::ECMAScript | regex_constants::__multiline;
232 	return (_M_re._M_automaton->_M_options() & __m) == __m;
233       }
234 
235        // Holds additional information used in BFS-mode.
236       template<typename _SearchMode, typename _ResultsVec>
237 	struct _State_info;
238 
239       template<typename _ResultsVec>
240 	struct _State_info<__bfs, _ResultsVec>
241 	{
242 	  explicit
243 	  _State_info(_StateIdT __start, size_t __n)
244 	  : _M_visited_states(new bool[__n]()), _M_start(__start)
245 	  { }
246 
247 	  ~_State_info() { delete[] _M_visited_states; }
248 
249 	  _State_info(const _State_info&) = delete;
250 	  _State_info& operator=(const _State_info&) = delete;
251 
252 	  bool _M_visited(_StateIdT __i)
253 	  {
254 	    if (_M_visited_states[__i])
255 	      return true;
256 	    _M_visited_states[__i] = true;
257 	    return false;
258 	  }
259 
260 	  void _M_queue(_StateIdT __i, const _ResultsVec& __res)
261 	  { _M_match_queue.emplace_back(__i, __res); }
262 
263 	  // Dummy implementations for BFS mode.
264 	  _BiIter* _M_get_sol_pos() { return nullptr; }
265 
266 	  // Saves states that need to be considered for the next character.
267 	  _GLIBCXX_STD_C::vector<pair<_StateIdT, _ResultsVec>> _M_match_queue;
268 	  // Indicates which states are already visited.
269 	  bool*     _M_visited_states;
270 	  // To record current solution.
271 	  _StateIdT _M_start;
272 	};
273 
274       template<typename _ResultsVec>
275 	struct _State_info<__dfs, _ResultsVec>
276 	{
277 	  explicit
278 	  _State_info(_StateIdT __start, size_t) : _M_start(__start)
279 	  { }
280 
281 	  // Dummy implementations for DFS mode.
282 	  bool _M_visited(_StateIdT) const { return false; }
283 	  void _M_queue(_StateIdT, const _ResultsVec&) { }
284 
285 	  _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
286 
287 	  // To record current solution.
288 	  _StateIdT _M_start;
289 	  _BiIter   _M_sol_pos;
290 	};
291 
292     public:
293       _ResultsVec                                           _M_cur_results;
294       _BiIter                                               _M_current;
295       _BiIter                                               _M_begin;
296       const _BiIter                                         _M_end;
297       const _RegexT&                                        _M_re;
298       const _NFAT&                                          _M_nfa;
299       _ResultsVec&                                          _M_results;
300       _GLIBCXX_STD_C::vector<pair<_BiIter, int>>            _M_rep_count;
301       _State_info<__search_mode, _ResultsVec>		    _M_states;
302       _FlagT                                                _M_flags;
303       // Do we have a solution so far?
304       bool                                                  _M_has_sol;
305     };
306 
307  ///@} regex-detail
308 } // namespace __detail
309 _GLIBCXX_END_NAMESPACE_VERSION
310 } // namespace std
311 
312 #include <bits/regex_executor.tcc>
313