xref: /netbsd-src/external/gpl3/gcc/dist/libstdc++-v3/include/bits/regex_automaton.tcc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
14d5abbe8Smrg // class template regex -*- C++ -*-
24d5abbe8Smrg 
3b1e83836Smrg // Copyright (C) 2013-2022 Free Software Foundation, Inc.
44d5abbe8Smrg //
54d5abbe8Smrg // This file is part of the GNU ISO C++ Library.  This library is free
64d5abbe8Smrg // software; you can redistribute it and/or modify it under the
74d5abbe8Smrg // terms of the GNU General Public License as published by the
84d5abbe8Smrg // Free Software Foundation; either version 3, or (at your option)
94d5abbe8Smrg // any later version.
104d5abbe8Smrg 
114d5abbe8Smrg // This library is distributed in the hope that it will be useful,
124d5abbe8Smrg // but WITHOUT ANY WARRANTY; without even the implied warranty of
134d5abbe8Smrg // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
144d5abbe8Smrg // GNU General Public License for more details.
154d5abbe8Smrg 
164d5abbe8Smrg // Under Section 7 of GPL version 3, you are granted additional
174d5abbe8Smrg // permissions described in the GCC Runtime Library Exception, version
184d5abbe8Smrg // 3.1, as published by the Free Software Foundation.
194d5abbe8Smrg 
204d5abbe8Smrg // You should have received a copy of the GNU General Public License and
214d5abbe8Smrg // a copy of the GCC Runtime Library Exception along with this program;
224d5abbe8Smrg // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
234d5abbe8Smrg // <http://www.gnu.org/licenses/>.
244d5abbe8Smrg 
254d5abbe8Smrg /**
264d5abbe8Smrg  *  @file bits/regex_automaton.tcc
274d5abbe8Smrg  *  This is an internal header file, included by other library headers.
284d5abbe8Smrg  *  Do not attempt to use it directly. @headername{regex}
294d5abbe8Smrg  */
304d5abbe8Smrg 
314d5abbe8Smrg namespace std _GLIBCXX_VISIBILITY(default)
324d5abbe8Smrg {
338b6133e5Smrg _GLIBCXX_BEGIN_NAMESPACE_VERSION
348b6133e5Smrg 
35a3e9eb18Smrg namespace __detail
36a3e9eb18Smrg {
374d5abbe8Smrg #ifdef _GLIBCXX_DEBUG
384d5abbe8Smrg   inline std::ostream&
_M_print(std::ostream & __ostr) const39*0a307195Smrg   _State_base::_M_print(std::ostream& __ostr) const
404d5abbe8Smrg   {
414d5abbe8Smrg     switch (_M_opcode)
424d5abbe8Smrg     {
434d5abbe8Smrg       case _S_opcode_alternative:
444d5abbe8Smrg       case _S_opcode_repeat:
45*0a307195Smrg 	__ostr << "alt next=" << _M_next << " alt=" << _M_alt;
464d5abbe8Smrg 	break;
474d5abbe8Smrg       case _S_opcode_subexpr_begin:
48*0a307195Smrg 	__ostr << "subexpr begin next=" << _M_next << " index=" << _M_subexpr;
494d5abbe8Smrg 	break;
504d5abbe8Smrg       case _S_opcode_subexpr_end:
51*0a307195Smrg 	__ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
524d5abbe8Smrg 	break;
534d5abbe8Smrg       case _S_opcode_backref:
54*0a307195Smrg 	__ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
554d5abbe8Smrg 	break;
564d5abbe8Smrg       case _S_opcode_match:
57*0a307195Smrg 	__ostr << "match next=" << _M_next;
584d5abbe8Smrg 	break;
594d5abbe8Smrg       case _S_opcode_accept:
60*0a307195Smrg 	__ostr << "accept next=" << _M_next;
614d5abbe8Smrg 	break;
624d5abbe8Smrg       default:
63*0a307195Smrg 	__ostr << "unknown next=" << _M_next;
644d5abbe8Smrg 	break;
654d5abbe8Smrg     }
66*0a307195Smrg     return __ostr;
674d5abbe8Smrg   }
684d5abbe8Smrg 
694d5abbe8Smrg   // Prints graphviz dot commands for state.
704d5abbe8Smrg   inline std::ostream&
_M_dot(std::ostream & __ostr,_StateIdT __id) const714d5abbe8Smrg   _State_base::_M_dot(std::ostream& __ostr, _StateIdT __id) const
724d5abbe8Smrg   {
734d5abbe8Smrg     switch (_M_opcode)
744d5abbe8Smrg     {
754d5abbe8Smrg       case _S_opcode_alternative:
764d5abbe8Smrg       case _S_opcode_repeat:
774d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nALT\"];\n"
784d5abbe8Smrg 	       << __id << " -> " << _M_next
794d5abbe8Smrg 	       << " [label=\"next\", tailport=\"s\"];\n"
804d5abbe8Smrg 	       << __id << " -> " << _M_alt
814d5abbe8Smrg 	       << " [label=\"alt\", tailport=\"n\"];\n";
824d5abbe8Smrg 	break;
834d5abbe8Smrg       case _S_opcode_backref:
844d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
854d5abbe8Smrg 	       << _M_subexpr << "\"];\n"
864d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
874d5abbe8Smrg 	break;
884d5abbe8Smrg       case _S_opcode_line_begin_assertion:
894d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
904d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
914d5abbe8Smrg 	break;
924d5abbe8Smrg       case _S_opcode_line_end_assertion:
934d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
944d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
954d5abbe8Smrg 	break;
964d5abbe8Smrg       case _S_opcode_word_boundary:
974d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
984d5abbe8Smrg 	       << _M_neg << "\"];\n"
994d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
1004d5abbe8Smrg 	break;
1014d5abbe8Smrg       case _S_opcode_subexpr_lookahead:
1024d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
1034d5abbe8Smrg 	       << __id << " -> " << _M_next
1044d5abbe8Smrg 	       << " [label=\"epsilon\", tailport=\"s\"];\n"
1054d5abbe8Smrg 	       << __id << " -> " << _M_alt
1064d5abbe8Smrg 	       << " [label=\"<assert>\", tailport=\"n\"];\n";
1074d5abbe8Smrg 	break;
1084d5abbe8Smrg       case _S_opcode_subexpr_begin:
1094d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
1104d5abbe8Smrg 	       << _M_subexpr << "\"];\n"
1114d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
1124d5abbe8Smrg 	break;
1134d5abbe8Smrg       case _S_opcode_subexpr_end:
1144d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nSEND "
1154d5abbe8Smrg 	       << _M_subexpr << "\"];\n"
1164d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
1174d5abbe8Smrg 	break;
1184d5abbe8Smrg       case _S_opcode_dummy:
1194d5abbe8Smrg 	break;
1204d5abbe8Smrg       case _S_opcode_match:
1214d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
1224d5abbe8Smrg 	       << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
1234d5abbe8Smrg 	break;
1244d5abbe8Smrg       case _S_opcode_accept:
1254d5abbe8Smrg 	__ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
1264d5abbe8Smrg 	break;
1274d5abbe8Smrg       default:
1284d5abbe8Smrg 	_GLIBCXX_DEBUG_ASSERT(false);
1294d5abbe8Smrg 	break;
1304d5abbe8Smrg     }
1314d5abbe8Smrg     return __ostr;
1324d5abbe8Smrg   }
1334d5abbe8Smrg 
1344d5abbe8Smrg   template<typename _TraitsT>
1354d5abbe8Smrg     std::ostream&
_M_dot(std::ostream & __ostr) const1364d5abbe8Smrg     _NFA<_TraitsT>::_M_dot(std::ostream& __ostr) const
1374d5abbe8Smrg     {
1384d5abbe8Smrg       __ostr << "digraph _Nfa {\n"
1394d5abbe8Smrg 		"  rankdir=LR;\n";
1404d5abbe8Smrg       for (size_t __i = 0; __i < this->size(); ++__i)
1414d5abbe8Smrg 	(*this)[__i]._M_dot(__ostr, __i);
1424d5abbe8Smrg       __ostr << "}\n";
1434d5abbe8Smrg       return __ostr;
1444d5abbe8Smrg     }
1454d5abbe8Smrg #endif
1464d5abbe8Smrg 
1474d5abbe8Smrg   template<typename _TraitsT>
1484d5abbe8Smrg     _StateIdT
_M_insert_backref(size_t __index)1494d5abbe8Smrg     _NFA<_TraitsT>::_M_insert_backref(size_t __index)
1504d5abbe8Smrg     {
151f9a78e0eSmrg       if (this->_M_flags & regex_constants::__polynomial)
152f9a78e0eSmrg 	__throw_regex_error(regex_constants::error_complexity,
153f9a78e0eSmrg 			    "Unexpected back-reference in polynomial mode.");
1544d5abbe8Smrg       // To figure out whether a backref is valid, a stack is used to store
1554d5abbe8Smrg       // unfinished sub-expressions. For example, when parsing
1564d5abbe8Smrg       // "(a(b)(c\\1(d)))" at '\\1', _M_subexpr_count is 3, indicating that 3
1574d5abbe8Smrg       // sub expressions are parsed or partially parsed(in the stack), aka,
1584d5abbe8Smrg       // "(a..", "(b)" and "(c..").
1594d5abbe8Smrg       // _M_paren_stack is {1, 3}, for incomplete "(a.." and "(c..". At this
1604d5abbe8Smrg       // time, "\\2" is valid, but "\\1" and "\\3" are not.
1614d5abbe8Smrg       if (__index >= _M_subexpr_count)
162f9a78e0eSmrg 	__throw_regex_error(
163f9a78e0eSmrg 	  regex_constants::error_backref,
164f9a78e0eSmrg 	  "Back-reference index exceeds current sub-expression count.");
1654d5abbe8Smrg       for (auto __it : this->_M_paren_stack)
1664d5abbe8Smrg 	if (__index == __it)
167f9a78e0eSmrg 	  __throw_regex_error(
168f9a78e0eSmrg 	    regex_constants::error_backref,
169f9a78e0eSmrg 	    "Back-reference referred to an opened sub-expression.");
1704d5abbe8Smrg       this->_M_has_backref = true;
1714d5abbe8Smrg       _StateT __tmp(_S_opcode_backref);
1724d5abbe8Smrg       __tmp._M_backref_index = __index;
1734d5abbe8Smrg       return _M_insert_state(std::move(__tmp));
1744d5abbe8Smrg     }
1754d5abbe8Smrg 
1764d5abbe8Smrg   template<typename _TraitsT>
1774d5abbe8Smrg     void
_M_eliminate_dummy()1784d5abbe8Smrg     _NFA<_TraitsT>::_M_eliminate_dummy()
1794d5abbe8Smrg     {
1804d5abbe8Smrg       for (auto& __it : *this)
1814d5abbe8Smrg 	{
182f9a78e0eSmrg 	  while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode()
1834d5abbe8Smrg 		 == _S_opcode_dummy)
1844d5abbe8Smrg 	    __it._M_next = (*this)[__it._M_next]._M_next;
185f9a78e0eSmrg 	  if (__it._M_has_alt())
186f9a78e0eSmrg 	    while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode()
1874d5abbe8Smrg 		   == _S_opcode_dummy)
1884d5abbe8Smrg 	      __it._M_alt = (*this)[__it._M_alt]._M_next;
1894d5abbe8Smrg 	}
1904d5abbe8Smrg     }
1914d5abbe8Smrg 
1924d5abbe8Smrg   // Just apply DFS on the sequence and re-link their links.
1934d5abbe8Smrg   template<typename _TraitsT>
1944d5abbe8Smrg     _StateSeq<_TraitsT>
_M_clone()1954d5abbe8Smrg     _StateSeq<_TraitsT>::_M_clone()
1964d5abbe8Smrg     {
197b1e83836Smrg       _GLIBCXX_STD_C::map<_StateIdT, _StateIdT> __m;
198b1e83836Smrg       std::stack<_StateIdT, _GLIBCXX_STD_C::deque<_StateIdT>> __stack;
1994d5abbe8Smrg       __stack.push(_M_start);
2004d5abbe8Smrg       while (!__stack.empty())
2014d5abbe8Smrg 	{
2024d5abbe8Smrg 	  auto __u = __stack.top();
2034d5abbe8Smrg 	  __stack.pop();
2044d5abbe8Smrg 	  auto __dup = _M_nfa[__u];
2054d5abbe8Smrg 	  // _M_insert_state() never return -1
206f9a78e0eSmrg 	  auto __id = _M_nfa._M_insert_state(std::move(__dup));
2074d5abbe8Smrg 	  __m[__u] = __id;
208f9a78e0eSmrg 	  if (__dup._M_has_alt())
2094d5abbe8Smrg 	    if (__dup._M_alt != _S_invalid_state_id
2104d5abbe8Smrg 		&& __m.count(__dup._M_alt) == 0)
2114d5abbe8Smrg 	      __stack.push(__dup._M_alt);
2124d5abbe8Smrg 	  if (__u == _M_end)
2134d5abbe8Smrg 	    continue;
2144d5abbe8Smrg 	  if (__dup._M_next != _S_invalid_state_id
2154d5abbe8Smrg 	      && __m.count(__dup._M_next) == 0)
2164d5abbe8Smrg 	    __stack.push(__dup._M_next);
2174d5abbe8Smrg 	}
2184d5abbe8Smrg       for (auto __it : __m)
2194d5abbe8Smrg 	{
2204d5abbe8Smrg 	  auto __v = __it.second;
2214d5abbe8Smrg 	  auto& __ref = _M_nfa[__v];
2224d5abbe8Smrg 	  if (__ref._M_next != _S_invalid_state_id)
223181254a7Smrg 	    __ref._M_next = __m.find(__ref._M_next)->second;
224181254a7Smrg 	  if (__ref._M_has_alt() && __ref._M_alt != _S_invalid_state_id)
225181254a7Smrg 	    __ref._M_alt = __m.find(__ref._M_alt)->second;
2264d5abbe8Smrg 	}
2274d5abbe8Smrg       return _StateSeq(_M_nfa, __m[_M_start], __m[_M_end]);
2284d5abbe8Smrg     }
229a3e9eb18Smrg } // namespace __detail
2304d5abbe8Smrg 
2314d5abbe8Smrg _GLIBCXX_END_NAMESPACE_VERSION
2324d5abbe8Smrg } // namespace
233