xref: /dflybsd-src/contrib/gcc-8.0/libstdc++-v3/include/bits/regex_compiler.tcc (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj // class template regex -*- C++ -*-
2*38fd1498Szrj 
3*38fd1498Szrj // Copyright (C) 2013-2018 Free Software Foundation, Inc.
4*38fd1498Szrj //
5*38fd1498Szrj // This file is part of the GNU ISO C++ Library.  This library is free
6*38fd1498Szrj // software; you can redistribute it and/or modify it under the
7*38fd1498Szrj // terms of the GNU General Public License as published by the
8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option)
9*38fd1498Szrj // any later version.
10*38fd1498Szrj 
11*38fd1498Szrj // This library is distributed in the hope that it will be useful,
12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of
13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14*38fd1498Szrj // GNU General Public License for more details.
15*38fd1498Szrj 
16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional
17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version
18*38fd1498Szrj // 3.1, as published by the Free Software Foundation.
19*38fd1498Szrj 
20*38fd1498Szrj // You should have received a copy of the GNU General Public License and
21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program;
22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23*38fd1498Szrj // <http://www.gnu.org/licenses/>.
24*38fd1498Szrj 
25*38fd1498Szrj /**
26*38fd1498Szrj  *  @file bits/regex_compiler.tcc
27*38fd1498Szrj  *  This is an internal header file, included by other library headers.
28*38fd1498Szrj  *  Do not attempt to use it directly. @headername{regex}
29*38fd1498Szrj  */
30*38fd1498Szrj 
31*38fd1498Szrj // FIXME make comments doxygen format.
32*38fd1498Szrj 
33*38fd1498Szrj /*
34*38fd1498Szrj // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
35*38fd1498Szrj // (http://swtch.com/~rsc/regexp/regexp1.html),
36*38fd1498Szrj // but doesn't strictly follow it.
37*38fd1498Szrj //
38*38fd1498Szrj // When compiling, states are *chained* instead of tree- or graph-constructed.
39*38fd1498Szrj // It's more like structured programs: there's if statement and loop statement.
40*38fd1498Szrj //
41*38fd1498Szrj // For alternative structure (say "a|b"), aka "if statement", two branches
42*38fd1498Szrj // should be constructed. However, these two shall merge to an "end_tag" at
43*38fd1498Szrj // the end of this operator:
44*38fd1498Szrj //
45*38fd1498Szrj //                branch1
46*38fd1498Szrj //              /        \
47*38fd1498Szrj // => begin_tag            end_tag =>
48*38fd1498Szrj //              \        /
49*38fd1498Szrj //                branch2
50*38fd1498Szrj //
51*38fd1498Szrj // This is the difference between this implementation and that in Russ's
52*38fd1498Szrj // article.
53*38fd1498Szrj //
54*38fd1498Szrj // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
55*38fd1498Szrj // All dummy nodes will be eliminated at the end of compilation.
56*38fd1498Szrj */
57*38fd1498Szrj 
58*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default)
59*38fd1498Szrj {
60*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION
61*38fd1498Szrj 
62*38fd1498Szrj namespace __detail
63*38fd1498Szrj {
64*38fd1498Szrj   template<typename _TraitsT>
65*38fd1498Szrj     _Compiler<_TraitsT>::
66*38fd1498Szrj     _Compiler(_IterT __b, _IterT __e,
67*38fd1498Szrj 	      const typename _TraitsT::locale_type& __loc, _FlagT __flags)
68*38fd1498Szrj     : _M_flags((__flags
69*38fd1498Szrj 		& (regex_constants::ECMAScript
70*38fd1498Szrj 		   | regex_constants::basic
71*38fd1498Szrj 		   | regex_constants::extended
72*38fd1498Szrj 		   | regex_constants::grep
73*38fd1498Szrj 		   | regex_constants::egrep
74*38fd1498Szrj 		   | regex_constants::awk))
75*38fd1498Szrj 	       ? __flags
76*38fd1498Szrj 	       : __flags | regex_constants::ECMAScript),
77*38fd1498Szrj       _M_scanner(__b, __e, _M_flags, __loc),
78*38fd1498Szrj       _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
79*38fd1498Szrj       _M_traits(_M_nfa->_M_traits),
80*38fd1498Szrj       _M_ctype(std::use_facet<_CtypeT>(__loc))
81*38fd1498Szrj     {
82*38fd1498Szrj       _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
83*38fd1498Szrj       __r._M_append(_M_nfa->_M_insert_subexpr_begin());
84*38fd1498Szrj       this->_M_disjunction();
85*38fd1498Szrj       if (!_M_match_token(_ScannerT::_S_token_eof))
86*38fd1498Szrj 	__throw_regex_error(regex_constants::error_paren);
87*38fd1498Szrj       __r._M_append(_M_pop());
88*38fd1498Szrj       __glibcxx_assert(_M_stack.empty());
89*38fd1498Szrj       __r._M_append(_M_nfa->_M_insert_subexpr_end());
90*38fd1498Szrj       __r._M_append(_M_nfa->_M_insert_accept());
91*38fd1498Szrj       _M_nfa->_M_eliminate_dummy();
92*38fd1498Szrj     }
93*38fd1498Szrj 
94*38fd1498Szrj   template<typename _TraitsT>
95*38fd1498Szrj     void
96*38fd1498Szrj     _Compiler<_TraitsT>::
97*38fd1498Szrj     _M_disjunction()
98*38fd1498Szrj     {
99*38fd1498Szrj       this->_M_alternative();
100*38fd1498Szrj       while (_M_match_token(_ScannerT::_S_token_or))
101*38fd1498Szrj 	{
102*38fd1498Szrj 	  _StateSeqT __alt1 = _M_pop();
103*38fd1498Szrj 	  this->_M_alternative();
104*38fd1498Szrj 	  _StateSeqT __alt2 = _M_pop();
105*38fd1498Szrj 	  auto __end = _M_nfa->_M_insert_dummy();
106*38fd1498Szrj 	  __alt1._M_append(__end);
107*38fd1498Szrj 	  __alt2._M_append(__end);
108*38fd1498Szrj 	  // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
109*38fd1498Szrj 	  // executes _M_alt before _M_next, as well as executing left
110*38fd1498Szrj 	  // alternative before right one.
111*38fd1498Szrj 	  _M_stack.push(_StateSeqT(*_M_nfa,
112*38fd1498Szrj 				   _M_nfa->_M_insert_alt(
113*38fd1498Szrj 				     __alt2._M_start, __alt1._M_start, false),
114*38fd1498Szrj 				   __end));
115*38fd1498Szrj 	}
116*38fd1498Szrj     }
117*38fd1498Szrj 
118*38fd1498Szrj   template<typename _TraitsT>
119*38fd1498Szrj     void
120*38fd1498Szrj     _Compiler<_TraitsT>::
121*38fd1498Szrj     _M_alternative()
122*38fd1498Szrj     {
123*38fd1498Szrj       if (this->_M_term())
124*38fd1498Szrj 	{
125*38fd1498Szrj 	  _StateSeqT __re = _M_pop();
126*38fd1498Szrj 	  this->_M_alternative();
127*38fd1498Szrj 	  __re._M_append(_M_pop());
128*38fd1498Szrj 	  _M_stack.push(__re);
129*38fd1498Szrj 	}
130*38fd1498Szrj       else
131*38fd1498Szrj 	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
132*38fd1498Szrj     }
133*38fd1498Szrj 
134*38fd1498Szrj   template<typename _TraitsT>
135*38fd1498Szrj     bool
136*38fd1498Szrj     _Compiler<_TraitsT>::
137*38fd1498Szrj     _M_term()
138*38fd1498Szrj     {
139*38fd1498Szrj       if (this->_M_assertion())
140*38fd1498Szrj 	return true;
141*38fd1498Szrj       if (this->_M_atom())
142*38fd1498Szrj 	{
143*38fd1498Szrj 	  while (this->_M_quantifier());
144*38fd1498Szrj 	  return true;
145*38fd1498Szrj 	}
146*38fd1498Szrj       return false;
147*38fd1498Szrj     }
148*38fd1498Szrj 
149*38fd1498Szrj   template<typename _TraitsT>
150*38fd1498Szrj     bool
151*38fd1498Szrj     _Compiler<_TraitsT>::
152*38fd1498Szrj     _M_assertion()
153*38fd1498Szrj     {
154*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_line_begin))
155*38fd1498Szrj 	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
156*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_line_end))
157*38fd1498Szrj 	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
158*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_word_bound))
159*38fd1498Szrj 	// _M_value[0] == 'n' means it's negative, say "not word boundary".
160*38fd1498Szrj 	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
161*38fd1498Szrj 	      _M_insert_word_bound(_M_value[0] == 'n')));
162*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
163*38fd1498Szrj 	{
164*38fd1498Szrj 	  auto __neg = _M_value[0] == 'n';
165*38fd1498Szrj 	  this->_M_disjunction();
166*38fd1498Szrj 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
167*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_paren,
168*38fd1498Szrj 				"Parenthesis is not closed.");
169*38fd1498Szrj 	  auto __tmp = _M_pop();
170*38fd1498Szrj 	  __tmp._M_append(_M_nfa->_M_insert_accept());
171*38fd1498Szrj 	  _M_stack.push(
172*38fd1498Szrj 	      _StateSeqT(
173*38fd1498Szrj 		*_M_nfa,
174*38fd1498Szrj 		_M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
175*38fd1498Szrj 	}
176*38fd1498Szrj       else
177*38fd1498Szrj 	return false;
178*38fd1498Szrj       return true;
179*38fd1498Szrj     }
180*38fd1498Szrj 
181*38fd1498Szrj   template<typename _TraitsT>
182*38fd1498Szrj     bool
183*38fd1498Szrj     _Compiler<_TraitsT>::
184*38fd1498Szrj     _M_quantifier()
185*38fd1498Szrj     {
186*38fd1498Szrj       bool __neg = (_M_flags & regex_constants::ECMAScript);
187*38fd1498Szrj       auto __init = [this, &__neg]()
188*38fd1498Szrj 	{
189*38fd1498Szrj 	  if (_M_stack.empty())
190*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_badrepeat,
191*38fd1498Szrj 				"Nothing to repeat before a quantifier.");
192*38fd1498Szrj 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
193*38fd1498Szrj 	};
194*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_closure0))
195*38fd1498Szrj 	{
196*38fd1498Szrj 	  __init();
197*38fd1498Szrj 	  auto __e = _M_pop();
198*38fd1498Szrj 	  _StateSeqT __r(*_M_nfa,
199*38fd1498Szrj 			 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
200*38fd1498Szrj 						  __e._M_start, __neg));
201*38fd1498Szrj 	  __e._M_append(__r);
202*38fd1498Szrj 	  _M_stack.push(__r);
203*38fd1498Szrj 	}
204*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_closure1))
205*38fd1498Szrj 	{
206*38fd1498Szrj 	  __init();
207*38fd1498Szrj 	  auto __e = _M_pop();
208*38fd1498Szrj 	  __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
209*38fd1498Szrj 						 __e._M_start, __neg));
210*38fd1498Szrj 	  _M_stack.push(__e);
211*38fd1498Szrj 	}
212*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_opt))
213*38fd1498Szrj 	{
214*38fd1498Szrj 	  __init();
215*38fd1498Szrj 	  auto __e = _M_pop();
216*38fd1498Szrj 	  auto __end = _M_nfa->_M_insert_dummy();
217*38fd1498Szrj 	  _StateSeqT __r(*_M_nfa,
218*38fd1498Szrj 			 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
219*38fd1498Szrj 						  __e._M_start, __neg));
220*38fd1498Szrj 	  __e._M_append(__end);
221*38fd1498Szrj 	  __r._M_append(__end);
222*38fd1498Szrj 	  _M_stack.push(__r);
223*38fd1498Szrj 	}
224*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
225*38fd1498Szrj 	{
226*38fd1498Szrj 	  if (_M_stack.empty())
227*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_badrepeat,
228*38fd1498Szrj 				"Nothing to repeat before a quantifier.");
229*38fd1498Szrj 	  if (!_M_match_token(_ScannerT::_S_token_dup_count))
230*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_badbrace,
231*38fd1498Szrj 				"Unexpected token in brace expression.");
232*38fd1498Szrj 	  _StateSeqT __r(_M_pop());
233*38fd1498Szrj 	  _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
234*38fd1498Szrj 	  long __min_rep = _M_cur_int_value(10);
235*38fd1498Szrj 	  bool __infi = false;
236*38fd1498Szrj 	  long __n;
237*38fd1498Szrj 
238*38fd1498Szrj 	  // {3
239*38fd1498Szrj 	  if (_M_match_token(_ScannerT::_S_token_comma))
240*38fd1498Szrj 	    if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
241*38fd1498Szrj 	      __n = _M_cur_int_value(10) - __min_rep;
242*38fd1498Szrj 	    else
243*38fd1498Szrj 	      __infi = true;
244*38fd1498Szrj 	  else
245*38fd1498Szrj 	    __n = 0;
246*38fd1498Szrj 	  if (!_M_match_token(_ScannerT::_S_token_interval_end))
247*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_brace,
248*38fd1498Szrj 				"Unexpected end of brace expression.");
249*38fd1498Szrj 
250*38fd1498Szrj 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
251*38fd1498Szrj 
252*38fd1498Szrj 	  for (long __i = 0; __i < __min_rep; ++__i)
253*38fd1498Szrj 	    __e._M_append(__r._M_clone());
254*38fd1498Szrj 
255*38fd1498Szrj 	  if (__infi)
256*38fd1498Szrj 	    {
257*38fd1498Szrj 	      auto __tmp = __r._M_clone();
258*38fd1498Szrj 	      _StateSeqT __s(*_M_nfa,
259*38fd1498Szrj 			     _M_nfa->_M_insert_repeat(_S_invalid_state_id,
260*38fd1498Szrj 						      __tmp._M_start, __neg));
261*38fd1498Szrj 	      __tmp._M_append(__s);
262*38fd1498Szrj 	      __e._M_append(__s);
263*38fd1498Szrj 	    }
264*38fd1498Szrj 	  else
265*38fd1498Szrj 	    {
266*38fd1498Szrj 	      if (__n < 0)
267*38fd1498Szrj 		__throw_regex_error(regex_constants::error_badbrace,
268*38fd1498Szrj 				    "Invalid range in brace expression.");
269*38fd1498Szrj 	      auto __end = _M_nfa->_M_insert_dummy();
270*38fd1498Szrj 	      // _M_alt is the "match more" branch, and _M_next is the
271*38fd1498Szrj 	      // "match less" one. Switch _M_alt and _M_next of all created
272*38fd1498Szrj 	      // nodes. This is a hack but IMO works well.
273*38fd1498Szrj 	      std::stack<_StateIdT> __stack;
274*38fd1498Szrj 	      for (long __i = 0; __i < __n; ++__i)
275*38fd1498Szrj 		{
276*38fd1498Szrj 		  auto __tmp = __r._M_clone();
277*38fd1498Szrj 		  auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
278*38fd1498Szrj 							__end, __neg);
279*38fd1498Szrj 		  __stack.push(__alt);
280*38fd1498Szrj 		  __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
281*38fd1498Szrj 		}
282*38fd1498Szrj 	      __e._M_append(__end);
283*38fd1498Szrj 	      while (!__stack.empty())
284*38fd1498Szrj 		{
285*38fd1498Szrj 		  auto& __tmp = (*_M_nfa)[__stack.top()];
286*38fd1498Szrj 		  __stack.pop();
287*38fd1498Szrj 		  std::swap(__tmp._M_next, __tmp._M_alt);
288*38fd1498Szrj 		}
289*38fd1498Szrj 	    }
290*38fd1498Szrj 	  _M_stack.push(__e);
291*38fd1498Szrj 	}
292*38fd1498Szrj       else
293*38fd1498Szrj 	return false;
294*38fd1498Szrj       return true;
295*38fd1498Szrj     }
296*38fd1498Szrj 
297*38fd1498Szrj #define __INSERT_REGEX_MATCHER(__func, ...)\
298*38fd1498Szrj 	do\
299*38fd1498Szrj 	  if (!(_M_flags & regex_constants::icase))\
300*38fd1498Szrj 	    if (!(_M_flags & regex_constants::collate))\
301*38fd1498Szrj 	      __func<false, false>(__VA_ARGS__);\
302*38fd1498Szrj 	    else\
303*38fd1498Szrj 	      __func<false, true>(__VA_ARGS__);\
304*38fd1498Szrj 	  else\
305*38fd1498Szrj 	    if (!(_M_flags & regex_constants::collate))\
306*38fd1498Szrj 	      __func<true, false>(__VA_ARGS__);\
307*38fd1498Szrj 	    else\
308*38fd1498Szrj 	      __func<true, true>(__VA_ARGS__);\
309*38fd1498Szrj 	while (false)
310*38fd1498Szrj 
311*38fd1498Szrj   template<typename _TraitsT>
312*38fd1498Szrj     bool
313*38fd1498Szrj     _Compiler<_TraitsT>::
314*38fd1498Szrj     _M_atom()
315*38fd1498Szrj     {
316*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_anychar))
317*38fd1498Szrj 	{
318*38fd1498Szrj 	  if (!(_M_flags & regex_constants::ECMAScript))
319*38fd1498Szrj 	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
320*38fd1498Szrj 	  else
321*38fd1498Szrj 	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
322*38fd1498Szrj 	}
323*38fd1498Szrj       else if (_M_try_char())
324*38fd1498Szrj 	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
325*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_backref))
326*38fd1498Szrj 	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
327*38fd1498Szrj 				 _M_insert_backref(_M_cur_int_value(10))));
328*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
329*38fd1498Szrj 	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
330*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
331*38fd1498Szrj 	{
332*38fd1498Szrj 	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
333*38fd1498Szrj 	  this->_M_disjunction();
334*38fd1498Szrj 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
335*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_paren,
336*38fd1498Szrj 				"Parenthesis is not closed.");
337*38fd1498Szrj 	  __r._M_append(_M_pop());
338*38fd1498Szrj 	  _M_stack.push(__r);
339*38fd1498Szrj 	}
340*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
341*38fd1498Szrj 	{
342*38fd1498Szrj 	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
343*38fd1498Szrj 	  this->_M_disjunction();
344*38fd1498Szrj 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
345*38fd1498Szrj 	    __throw_regex_error(regex_constants::error_paren,
346*38fd1498Szrj 				"Parenthesis is not closed.");
347*38fd1498Szrj 	  __r._M_append(_M_pop());
348*38fd1498Szrj 	  __r._M_append(_M_nfa->_M_insert_subexpr_end());
349*38fd1498Szrj 	  _M_stack.push(__r);
350*38fd1498Szrj 	}
351*38fd1498Szrj       else if (!_M_bracket_expression())
352*38fd1498Szrj 	return false;
353*38fd1498Szrj       return true;
354*38fd1498Szrj     }
355*38fd1498Szrj 
356*38fd1498Szrj   template<typename _TraitsT>
357*38fd1498Szrj     bool
358*38fd1498Szrj     _Compiler<_TraitsT>::
359*38fd1498Szrj     _M_bracket_expression()
360*38fd1498Szrj     {
361*38fd1498Szrj       bool __neg =
362*38fd1498Szrj 	_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
363*38fd1498Szrj       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
364*38fd1498Szrj 	return false;
365*38fd1498Szrj       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
366*38fd1498Szrj       return true;
367*38fd1498Szrj     }
368*38fd1498Szrj #undef __INSERT_REGEX_MATCHER
369*38fd1498Szrj 
370*38fd1498Szrj   template<typename _TraitsT>
371*38fd1498Szrj   template<bool __icase, bool __collate>
372*38fd1498Szrj     void
373*38fd1498Szrj     _Compiler<_TraitsT>::
374*38fd1498Szrj     _M_insert_any_matcher_ecma()
375*38fd1498Szrj     {
376*38fd1498Szrj       _M_stack.push(_StateSeqT(*_M_nfa,
377*38fd1498Szrj 	_M_nfa->_M_insert_matcher
378*38fd1498Szrj 	  (_AnyMatcher<_TraitsT, true, __icase, __collate>
379*38fd1498Szrj 	    (_M_traits))));
380*38fd1498Szrj     }
381*38fd1498Szrj 
382*38fd1498Szrj   template<typename _TraitsT>
383*38fd1498Szrj   template<bool __icase, bool __collate>
384*38fd1498Szrj     void
385*38fd1498Szrj     _Compiler<_TraitsT>::
386*38fd1498Szrj     _M_insert_any_matcher_posix()
387*38fd1498Szrj     {
388*38fd1498Szrj       _M_stack.push(_StateSeqT(*_M_nfa,
389*38fd1498Szrj 	_M_nfa->_M_insert_matcher
390*38fd1498Szrj 	  (_AnyMatcher<_TraitsT, false, __icase, __collate>
391*38fd1498Szrj 	    (_M_traits))));
392*38fd1498Szrj     }
393*38fd1498Szrj 
394*38fd1498Szrj   template<typename _TraitsT>
395*38fd1498Szrj   template<bool __icase, bool __collate>
396*38fd1498Szrj     void
397*38fd1498Szrj     _Compiler<_TraitsT>::
398*38fd1498Szrj     _M_insert_char_matcher()
399*38fd1498Szrj     {
400*38fd1498Szrj       _M_stack.push(_StateSeqT(*_M_nfa,
401*38fd1498Szrj 	_M_nfa->_M_insert_matcher
402*38fd1498Szrj 	  (_CharMatcher<_TraitsT, __icase, __collate>
403*38fd1498Szrj 	    (_M_value[0], _M_traits))));
404*38fd1498Szrj     }
405*38fd1498Szrj 
406*38fd1498Szrj   template<typename _TraitsT>
407*38fd1498Szrj   template<bool __icase, bool __collate>
408*38fd1498Szrj     void
409*38fd1498Szrj     _Compiler<_TraitsT>::
410*38fd1498Szrj     _M_insert_character_class_matcher()
411*38fd1498Szrj     {
412*38fd1498Szrj       __glibcxx_assert(_M_value.size() == 1);
413*38fd1498Szrj       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
414*38fd1498Szrj 	(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
415*38fd1498Szrj       __matcher._M_add_character_class(_M_value, false);
416*38fd1498Szrj       __matcher._M_ready();
417*38fd1498Szrj       _M_stack.push(_StateSeqT(*_M_nfa,
418*38fd1498Szrj 	_M_nfa->_M_insert_matcher(std::move(__matcher))));
419*38fd1498Szrj     }
420*38fd1498Szrj 
421*38fd1498Szrj   template<typename _TraitsT>
422*38fd1498Szrj   template<bool __icase, bool __collate>
423*38fd1498Szrj     void
424*38fd1498Szrj     _Compiler<_TraitsT>::
425*38fd1498Szrj     _M_insert_bracket_matcher(bool __neg)
426*38fd1498Szrj     {
427*38fd1498Szrj       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
428*38fd1498Szrj       pair<bool, _CharT> __last_char; // Optional<_CharT>
429*38fd1498Szrj       __last_char.first = false;
430*38fd1498Szrj       if (!(_M_flags & regex_constants::ECMAScript))
431*38fd1498Szrj 	{
432*38fd1498Szrj 	  if (_M_try_char())
433*38fd1498Szrj 	    {
434*38fd1498Szrj 	      __last_char.first = true;
435*38fd1498Szrj 	      __last_char.second = _M_value[0];
436*38fd1498Szrj 	    }
437*38fd1498Szrj 	  else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
438*38fd1498Szrj 	    {
439*38fd1498Szrj 	      __last_char.first = true;
440*38fd1498Szrj 	      __last_char.second = '-';
441*38fd1498Szrj 	    }
442*38fd1498Szrj 	}
443*38fd1498Szrj       while (_M_expression_term(__last_char, __matcher));
444*38fd1498Szrj       if (__last_char.first)
445*38fd1498Szrj 	__matcher._M_add_char(__last_char.second);
446*38fd1498Szrj       __matcher._M_ready();
447*38fd1498Szrj       _M_stack.push(_StateSeqT(
448*38fd1498Szrj 		      *_M_nfa,
449*38fd1498Szrj 		      _M_nfa->_M_insert_matcher(std::move(__matcher))));
450*38fd1498Szrj     }
451*38fd1498Szrj 
452*38fd1498Szrj   template<typename _TraitsT>
453*38fd1498Szrj   template<bool __icase, bool __collate>
454*38fd1498Szrj     bool
455*38fd1498Szrj     _Compiler<_TraitsT>::
456*38fd1498Szrj     _M_expression_term(pair<bool, _CharT>& __last_char,
457*38fd1498Szrj 		       _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
458*38fd1498Szrj     {
459*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_bracket_end))
460*38fd1498Szrj 	return false;
461*38fd1498Szrj 
462*38fd1498Szrj       const auto __push_char = [&](_CharT __ch)
463*38fd1498Szrj       {
464*38fd1498Szrj 	if (__last_char.first)
465*38fd1498Szrj 	  __matcher._M_add_char(__last_char.second);
466*38fd1498Szrj 	else
467*38fd1498Szrj 	  __last_char.first = true;
468*38fd1498Szrj 	__last_char.second = __ch;
469*38fd1498Szrj       };
470*38fd1498Szrj       const auto __flush = [&]
471*38fd1498Szrj       {
472*38fd1498Szrj 	if (__last_char.first)
473*38fd1498Szrj 	  {
474*38fd1498Szrj 	    __matcher._M_add_char(__last_char.second);
475*38fd1498Szrj 	    __last_char.first = false;
476*38fd1498Szrj 	  }
477*38fd1498Szrj       };
478*38fd1498Szrj 
479*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_collsymbol))
480*38fd1498Szrj 	{
481*38fd1498Szrj 	  auto __symbol = __matcher._M_add_collate_element(_M_value);
482*38fd1498Szrj 	  if (__symbol.size() == 1)
483*38fd1498Szrj 	    __push_char(__symbol[0]);
484*38fd1498Szrj 	  else
485*38fd1498Szrj 	    __flush();
486*38fd1498Szrj 	}
487*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
488*38fd1498Szrj 	{
489*38fd1498Szrj 	  __flush();
490*38fd1498Szrj 	  __matcher._M_add_equivalence_class(_M_value);
491*38fd1498Szrj 	}
492*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
493*38fd1498Szrj 	{
494*38fd1498Szrj 	  __flush();
495*38fd1498Szrj 	  __matcher._M_add_character_class(_M_value, false);
496*38fd1498Szrj 	}
497*38fd1498Szrj       else if (_M_try_char())
498*38fd1498Szrj 	__push_char(_M_value[0]);
499*38fd1498Szrj       // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
500*38fd1498Szrj       // except when the '-' is the first or last character in the bracket
501*38fd1498Szrj       // expression ([--0]). ECMAScript treats all '-' after a range as a
502*38fd1498Szrj       // normal character. Also see above, where _M_expression_term gets called.
503*38fd1498Szrj       //
504*38fd1498Szrj       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
505*38fd1498Szrj       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
506*38fd1498Szrj       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
507*38fd1498Szrj       //
508*38fd1498Szrj       // It turns out that no one reads BNFs ;)
509*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
510*38fd1498Szrj 	{
511*38fd1498Szrj 	  if (!__last_char.first)
512*38fd1498Szrj 	    {
513*38fd1498Szrj 	      if (!(_M_flags & regex_constants::ECMAScript))
514*38fd1498Szrj 		{
515*38fd1498Szrj 		  if (_M_match_token(_ScannerT::_S_token_bracket_end))
516*38fd1498Szrj 		    {
517*38fd1498Szrj 		      __push_char('-');
518*38fd1498Szrj 		      return false;
519*38fd1498Szrj 		    }
520*38fd1498Szrj 		  __throw_regex_error(
521*38fd1498Szrj 		    regex_constants::error_range,
522*38fd1498Szrj 		    "Unexpected dash in bracket expression. For POSIX syntax, "
523*38fd1498Szrj 		    "a dash is not treated literally only when it is at "
524*38fd1498Szrj 		    "beginning or end.");
525*38fd1498Szrj 		}
526*38fd1498Szrj 	      __push_char('-');
527*38fd1498Szrj 	    }
528*38fd1498Szrj 	  else
529*38fd1498Szrj 	    {
530*38fd1498Szrj 	      if (_M_try_char())
531*38fd1498Szrj 		{
532*38fd1498Szrj 		  __matcher._M_make_range(__last_char.second, _M_value[0]);
533*38fd1498Szrj 		  __last_char.first = false;
534*38fd1498Szrj 		}
535*38fd1498Szrj 	      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
536*38fd1498Szrj 		{
537*38fd1498Szrj 		  __matcher._M_make_range(__last_char.second, '-');
538*38fd1498Szrj 		  __last_char.first = false;
539*38fd1498Szrj 		}
540*38fd1498Szrj 	      else
541*38fd1498Szrj 		{
542*38fd1498Szrj 		  if (_M_scanner._M_get_token()
543*38fd1498Szrj 		      != _ScannerT::_S_token_bracket_end)
544*38fd1498Szrj 		    __throw_regex_error(
545*38fd1498Szrj 		      regex_constants::error_range,
546*38fd1498Szrj 		      "Character is expected after a dash.");
547*38fd1498Szrj 		  __push_char('-');
548*38fd1498Szrj 		}
549*38fd1498Szrj 	    }
550*38fd1498Szrj 	}
551*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
552*38fd1498Szrj 	{
553*38fd1498Szrj 	  __flush();
554*38fd1498Szrj 	  __matcher._M_add_character_class(_M_value,
555*38fd1498Szrj 					   _M_ctype.is(_CtypeT::upper,
556*38fd1498Szrj 						       _M_value[0]));
557*38fd1498Szrj 	}
558*38fd1498Szrj       else
559*38fd1498Szrj 	__throw_regex_error(regex_constants::error_brack,
560*38fd1498Szrj 			    "Unexpected character in bracket expression.");
561*38fd1498Szrj 
562*38fd1498Szrj       return true;
563*38fd1498Szrj     }
564*38fd1498Szrj 
565*38fd1498Szrj   template<typename _TraitsT>
566*38fd1498Szrj     bool
567*38fd1498Szrj     _Compiler<_TraitsT>::
568*38fd1498Szrj     _M_try_char()
569*38fd1498Szrj     {
570*38fd1498Szrj       bool __is_char = false;
571*38fd1498Szrj       if (_M_match_token(_ScannerT::_S_token_oct_num))
572*38fd1498Szrj 	{
573*38fd1498Szrj 	  __is_char = true;
574*38fd1498Szrj 	  _M_value.assign(1, _M_cur_int_value(8));
575*38fd1498Szrj 	}
576*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_hex_num))
577*38fd1498Szrj 	{
578*38fd1498Szrj 	  __is_char = true;
579*38fd1498Szrj 	  _M_value.assign(1, _M_cur_int_value(16));
580*38fd1498Szrj 	}
581*38fd1498Szrj       else if (_M_match_token(_ScannerT::_S_token_ord_char))
582*38fd1498Szrj 	__is_char = true;
583*38fd1498Szrj       return __is_char;
584*38fd1498Szrj     }
585*38fd1498Szrj 
586*38fd1498Szrj   template<typename _TraitsT>
587*38fd1498Szrj     bool
588*38fd1498Szrj     _Compiler<_TraitsT>::
589*38fd1498Szrj     _M_match_token(_TokenT token)
590*38fd1498Szrj     {
591*38fd1498Szrj       if (token == _M_scanner._M_get_token())
592*38fd1498Szrj 	{
593*38fd1498Szrj 	  _M_value = _M_scanner._M_get_value();
594*38fd1498Szrj 	  _M_scanner._M_advance();
595*38fd1498Szrj 	  return true;
596*38fd1498Szrj 	}
597*38fd1498Szrj       return false;
598*38fd1498Szrj     }
599*38fd1498Szrj 
600*38fd1498Szrj   template<typename _TraitsT>
601*38fd1498Szrj     int
602*38fd1498Szrj     _Compiler<_TraitsT>::
603*38fd1498Szrj     _M_cur_int_value(int __radix)
604*38fd1498Szrj     {
605*38fd1498Szrj       long __v = 0;
606*38fd1498Szrj       for (typename _StringT::size_type __i = 0;
607*38fd1498Szrj 	   __i < _M_value.length(); ++__i)
608*38fd1498Szrj 	__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
609*38fd1498Szrj       return __v;
610*38fd1498Szrj     }
611*38fd1498Szrj 
612*38fd1498Szrj   template<typename _TraitsT, bool __icase, bool __collate>
613*38fd1498Szrj     bool
614*38fd1498Szrj     _BracketMatcher<_TraitsT, __icase, __collate>::
615*38fd1498Szrj     _M_apply(_CharT __ch, false_type) const
616*38fd1498Szrj     {
617*38fd1498Szrj       return [this, __ch]
618*38fd1498Szrj       {
619*38fd1498Szrj 	if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
620*38fd1498Szrj 			       _M_translator._M_translate(__ch)))
621*38fd1498Szrj 	  return true;
622*38fd1498Szrj 	auto __s = _M_translator._M_transform(__ch);
623*38fd1498Szrj 	for (auto& __it : _M_range_set)
624*38fd1498Szrj 	  if (_M_translator._M_match_range(__it.first, __it.second, __s))
625*38fd1498Szrj 	    return true;
626*38fd1498Szrj 	if (_M_traits.isctype(__ch, _M_class_set))
627*38fd1498Szrj 	  return true;
628*38fd1498Szrj 	if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
629*38fd1498Szrj 		      _M_traits.transform_primary(&__ch, &__ch+1))
630*38fd1498Szrj 	    != _M_equiv_set.end())
631*38fd1498Szrj 	  return true;
632*38fd1498Szrj 	for (auto& __it : _M_neg_class_set)
633*38fd1498Szrj 	  if (!_M_traits.isctype(__ch, __it))
634*38fd1498Szrj 	    return true;
635*38fd1498Szrj 	return false;
636*38fd1498Szrj       }() ^ _M_is_non_matching;
637*38fd1498Szrj     }
638*38fd1498Szrj } // namespace __detail
639*38fd1498Szrj 
640*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION
641*38fd1498Szrj } // namespace
642