xref: /netbsd-src/external/gpl3/gcc/dist/libstdc++-v3/include/bits/regex_compiler.h (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010-2022 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex_compiler.h
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
_GLIBCXX_VISIBILITY(default)31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 _GLIBCXX_BEGIN_NAMESPACE_CXX11
35 
36   template<typename>
37     class regex_traits;
38 
39 _GLIBCXX_END_NAMESPACE_CXX11
40 
41 namespace __detail
42 {
43   /**
44    * @addtogroup regex-detail
45    * @{
46    */
47 
48   template<typename, bool, bool>
49     struct _BracketMatcher;
50 
51   /**
52    * @brief Builds an NFA from an input iterator range.
53    *
54    * The %_TraitsT type should fulfill requirements [28.3].
55    */
56   template<typename _TraitsT>
57     class _Compiler
58     {
59     public:
60       typedef typename _TraitsT::char_type        _CharT;
61       typedef _NFA<_TraitsT>              	  _RegexT;
62       typedef regex_constants::syntax_option_type _FlagT;
63 
64       _Compiler(const _CharT* __b, const _CharT* __e,
65 		const typename _TraitsT::locale_type& __traits, _FlagT __flags);
66 
67       shared_ptr<const _RegexT>
68       _M_get_nfa() noexcept
69       { return std::move(_M_nfa); }
70 
71     private:
72       typedef _Scanner<_CharT>               _ScannerT;
73       typedef typename _TraitsT::string_type _StringT;
74       typedef typename _ScannerT::_TokenT    _TokenT;
75       typedef _StateSeq<_TraitsT>            _StateSeqT;
76       typedef std::stack<_StateSeqT>         _StackT;
77       typedef std::ctype<_CharT>             _CtypeT;
78 
79       // accepts a specific token or returns false.
80       bool
81       _M_match_token(_TokenT __token);
82 
83       void
84       _M_disjunction();
85 
86       void
87       _M_alternative();
88 
89       bool
90       _M_term();
91 
92       bool
93       _M_assertion();
94 
95       bool
96       _M_quantifier();
97 
98       bool
99       _M_atom();
100 
101       bool
102       _M_bracket_expression();
103 
104       template<bool __icase, bool __collate>
105 	void
106 	_M_insert_any_matcher_ecma();
107 
108       template<bool __icase, bool __collate>
109 	void
110 	_M_insert_any_matcher_posix();
111 
112       template<bool __icase, bool __collate>
113 	void
114 	_M_insert_char_matcher();
115 
116       template<bool __icase, bool __collate>
117 	void
118 	_M_insert_character_class_matcher();
119 
120       template<bool __icase, bool __collate>
121 	void
122 	_M_insert_bracket_matcher(bool __neg);
123 
124       // Cache of the last atom seen in a bracketed range expression.
125       struct _BracketState
126       {
127 	enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None;
128 	_CharT _M_char = _CharT();
129 
130 	void
131 	set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; }
132 
133 	_GLIBCXX_NODISCARD _CharT
134 	get() const noexcept { return _M_char; }
135 
136 	void
137 	reset(_Type __t = _Type::_None) noexcept { _M_type = __t; }
138 
139 	explicit operator bool() const noexcept
140 	{ return _M_type != _Type::_None; }
141 
142 	// Previous token was a single character.
143 	_GLIBCXX_NODISCARD bool
144 	_M_is_char() const noexcept { return _M_type == _Type::_Char; }
145 
146 	// Previous token was a character class, equivalent class,
147 	// collating symbol etc.
148 	_GLIBCXX_NODISCARD bool
149 	_M_is_class() const noexcept { return _M_type == _Type::_Class; }
150       };
151 
152       template<bool __icase, bool __collate>
153 	using _BracketMatcher
154 	  = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>;
155 
156       // Returns true if successfully parsed one term and should continue
157       // compiling a bracket expression.
158       // Returns false if the compiler should move on.
159       template<bool __icase, bool __collate>
160 	bool
161 	_M_expression_term(_BracketState& __last_char,
162 			   _BracketMatcher<__icase, __collate>& __matcher);
163 
164       int
165       _M_cur_int_value(int __radix);
166 
167       bool
168       _M_try_char();
169 
170       _StateSeqT
171       _M_pop()
172       {
173 	auto ret = _M_stack.top();
174 	_M_stack.pop();
175 	return ret;
176       }
177 
178       static _FlagT
179       _S_validate(_FlagT __f)
180       {
181 	using namespace regex_constants;
182 	switch (__f & (ECMAScript|basic|extended|awk|grep|egrep))
183 	  {
184 	  case ECMAScript:
185 	  case basic:
186 	  case extended:
187 	  case awk:
188 	  case grep:
189 	  case egrep:
190 	    return __f;
191 	  case _FlagT(0):
192 	    return __f | ECMAScript;
193 	  default:
194 	    std::__throw_regex_error(_S_grammar, "conflicting grammar options");
195 	  }
196       }
197 
198       _FlagT              _M_flags;
199       _ScannerT           _M_scanner;
200       shared_ptr<_RegexT> _M_nfa;
201       _StringT            _M_value;
202       _StackT             _M_stack;
203       const _TraitsT&     _M_traits;
204       const _CtypeT&      _M_ctype;
205     };
206 
207   // [28.13.14]
208   template<typename _TraitsT, bool __icase, bool __collate>
209     class _RegexTranslatorBase
210     {
211     public:
212       typedef typename _TraitsT::char_type	      _CharT;
213       typedef typename _TraitsT::string_type	      _StringT;
214       typedef _StringT _StrTransT;
215 
216       explicit
217       _RegexTranslatorBase(const _TraitsT& __traits)
218       : _M_traits(__traits)
219       { }
220 
221       _CharT
222       _M_translate(_CharT __ch) const
223       {
224 	if _GLIBCXX17_CONSTEXPR (__icase)
225 	  return _M_traits.translate_nocase(__ch);
226 	else if _GLIBCXX17_CONSTEXPR (__collate)
227 	  return _M_traits.translate(__ch);
228 	else
229 	  return __ch;
230       }
231 
232       _StrTransT
233       _M_transform(_CharT __ch) const
234       {
235 	_StrTransT __str(1, __ch);
236 	return _M_traits.transform(__str.begin(), __str.end());
237       }
238 
239       // See LWG 523. It's not efficiently implementable when _TraitsT is not
240       // std::regex_traits<>, and __collate is true. See specializations for
241       // implementations of other cases.
242       bool
243       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
244 		     const _StrTransT& __s) const
245       { return __first <= __s && __s <= __last; }
246 
247     protected:
248       bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
249       {
250 	typedef std::ctype<_CharT> __ctype_type;
251 	const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
252 	auto __lower = __fctyp.tolower(__ch);
253 	auto __upper = __fctyp.toupper(__ch);
254 	return (__first <= __lower && __lower <= __last)
255 	  || (__first <= __upper && __upper <= __last);
256       }
257 
258       const _TraitsT& _M_traits;
259     };
260 
261   template<typename _TraitsT, bool __icase, bool __collate>
262     class _RegexTranslator
263     : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
264     {
265     public:
266       typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
267       using _Base::_Base;
268     };
269 
270   template<typename _TraitsT, bool __icase>
271     class _RegexTranslator<_TraitsT, __icase, false>
272     : public _RegexTranslatorBase<_TraitsT, __icase, false>
273     {
274     public:
275       typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
276       typedef typename _Base::_CharT _CharT;
277       typedef _CharT _StrTransT;
278 
279       using _Base::_Base;
280 
281       _StrTransT
282       _M_transform(_CharT __ch) const
283       { return __ch; }
284 
285       bool
286       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
287       {
288 	if _GLIBCXX17_CONSTEXPR (!__icase)
289 	  return __first <= __ch && __ch <= __last;
290 	else
291 	  return this->_M_in_range_icase(__first, __last, __ch);
292       }
293     };
294 
295   template<typename _CharType>
296     class _RegexTranslator<std::regex_traits<_CharType>, true, true>
297     : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
298     {
299     public:
300       typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
301 	_Base;
302       typedef typename _Base::_CharT _CharT;
303       typedef typename _Base::_StrTransT _StrTransT;
304 
305       using _Base::_Base;
306 
307       bool
308       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
309 		     const _StrTransT& __str) const
310       {
311 	__glibcxx_assert(__first.size() == 1);
312 	__glibcxx_assert(__last.size() == 1);
313 	__glibcxx_assert(__str.size() == 1);
314 	return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
315       }
316     };
317 
318   template<typename _TraitsT>
319     class _RegexTranslator<_TraitsT, false, false>
320     {
321     public:
322       typedef typename _TraitsT::char_type _CharT;
323       typedef _CharT                       _StrTransT;
324 
325       explicit
326       _RegexTranslator(const _TraitsT&)
327       { }
328 
329       _CharT
330       _M_translate(_CharT __ch) const
331       { return __ch; }
332 
333       _StrTransT
334       _M_transform(_CharT __ch) const
335       { return __ch; }
336 
337       bool
338       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
339       { return __first <= __ch && __ch <= __last; }
340     };
341 
342   template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
343     struct _AnyMatcher;
344 
345   template<typename _TraitsT, bool __icase, bool __collate>
346     struct _AnyMatcher<_TraitsT, false, __icase, __collate>
347     {
348       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
349       typedef typename _TransT::_CharT                       _CharT;
350 
351       explicit
352       _AnyMatcher(const _TraitsT& __traits)
353       : _M_translator(__traits)
354       { }
355 
356       bool
357       operator()(_CharT __ch) const
358       {
359 	static auto __nul = _M_translator._M_translate('\0');
360 	return _M_translator._M_translate(__ch) != __nul;
361       }
362 
363       _TransT _M_translator;
364     };
365 
366   template<typename _TraitsT, bool __icase, bool __collate>
367     struct _AnyMatcher<_TraitsT, true, __icase, __collate>
368     {
369       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
370       typedef typename _TransT::_CharT                       _CharT;
371 
372       explicit
373       _AnyMatcher(const _TraitsT& __traits)
374       : _M_translator(__traits)
375       { }
376 
377       bool
378       operator()(_CharT __ch) const
379       { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
380 
381       bool
382       _M_apply(_CharT __ch, true_type) const
383       {
384 	auto __c = _M_translator._M_translate(__ch);
385 	auto __n = _M_translator._M_translate('\n');
386 	auto __r = _M_translator._M_translate('\r');
387 	return __c != __n && __c != __r;
388       }
389 
390       bool
391       _M_apply(_CharT __ch, false_type) const
392       {
393 	auto __c = _M_translator._M_translate(__ch);
394 	auto __n = _M_translator._M_translate('\n');
395 	auto __r = _M_translator._M_translate('\r');
396 	auto __u2028 = _M_translator._M_translate(u'\u2028');
397 	auto __u2029 = _M_translator._M_translate(u'\u2029');
398 	return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
399       }
400 
401       _TransT _M_translator;
402     };
403 
404   template<typename _TraitsT, bool __icase, bool __collate>
405     struct _CharMatcher
406     {
407       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
408       typedef typename _TransT::_CharT                       _CharT;
409 
410       _CharMatcher(_CharT __ch, const _TraitsT& __traits)
411       : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
412       { }
413 
414       bool
415       operator()(_CharT __ch) const
416       { return _M_ch == _M_translator._M_translate(__ch); }
417 
418       _TransT _M_translator;
419       _CharT  _M_ch;
420     };
421 
422   /// Matches a character range (bracket expression)
423   template<typename _TraitsT, bool __icase, bool __collate>
424     struct _BracketMatcher
425     {
426     public:
427       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
428       typedef typename _TransT::_CharT                       _CharT;
429       typedef typename _TransT::_StrTransT                   _StrTransT;
430       typedef typename _TraitsT::string_type                 _StringT;
431       typedef typename _TraitsT::char_class_type             _CharClassT;
432 
433     public:
434       _BracketMatcher(bool __is_non_matching,
435 		      const _TraitsT& __traits)
436       : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
437       _M_is_non_matching(__is_non_matching)
438       { }
439 
440       bool
441       operator()(_CharT __ch) const
442       {
443 	_GLIBCXX_DEBUG_ASSERT(_M_is_ready);
444 	return _M_apply(__ch, _UseCache());
445       }
446 
447       void
448       _M_add_char(_CharT __c)
449       {
450 	_M_char_set.push_back(_M_translator._M_translate(__c));
451 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
452       }
453 
454       _StringT
455       _M_add_collate_element(const _StringT& __s)
456       {
457 	auto __st = _M_traits.lookup_collatename(__s.data(),
458 						 __s.data() + __s.size());
459 	if (__st.empty())
460 	  __throw_regex_error(regex_constants::error_collate,
461 			      "Invalid collate element.");
462 	_M_char_set.push_back(_M_translator._M_translate(__st[0]));
463 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
464 	return __st;
465       }
466 
467       void
468       _M_add_equivalence_class(const _StringT& __s)
469       {
470 	auto __st = _M_traits.lookup_collatename(__s.data(),
471 						 __s.data() + __s.size());
472 	if (__st.empty())
473 	  __throw_regex_error(regex_constants::error_collate,
474 			      "Invalid equivalence class.");
475 	__st = _M_traits.transform_primary(__st.data(),
476 					   __st.data() + __st.size());
477 	_M_equiv_set.push_back(__st);
478 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
479       }
480 
481       // __neg should be true for \D, \S and \W only.
482       void
483       _M_add_character_class(const _StringT& __s, bool __neg)
484       {
485 	auto __mask = _M_traits.lookup_classname(__s.data(),
486 						 __s.data() + __s.size(),
487 						 __icase);
488 	if (__mask == 0)
489 	  __throw_regex_error(regex_constants::error_collate,
490 			      "Invalid character class.");
491 	if (!__neg)
492 	  _M_class_set |= __mask;
493 	else
494 	  _M_neg_class_set.push_back(__mask);
495 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
496       }
497 
498       void
499       _M_make_range(_CharT __l, _CharT __r)
500       {
501 	if (__l > __r)
502 	  __throw_regex_error(regex_constants::error_range,
503 			      "Invalid range in bracket expression.");
504 	_M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
505 					 _M_translator._M_transform(__r)));
506 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
507       }
508 
509       void
510       _M_ready()
511       {
512 	std::sort(_M_char_set.begin(), _M_char_set.end());
513 	auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
514 	_M_char_set.erase(__end, _M_char_set.end());
515 	_M_make_cache(_UseCache());
516 	_GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
517       }
518 
519     private:
520       // Currently we only use the cache for char
521       using _UseCache = typename std::is_same<_CharT, char>::type;
522 
523       static constexpr size_t
524       _S_cache_size =
525 	1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
526 
527       struct _Dummy { };
528       using _CacheT = std::__conditional_t<_UseCache::value,
529 					   std::bitset<_S_cache_size>,
530 					   _Dummy>;
531       using _UnsignedCharT = typename std::make_unsigned<_CharT>::type;
532 
533       bool
534       _M_apply(_CharT __ch, false_type) const;
535 
536       bool
537       _M_apply(_CharT __ch, true_type) const
538       { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
539 
540       void
541       _M_make_cache(true_type)
542       {
543 	for (unsigned __i = 0; __i < _M_cache.size(); __i++)
544 	  _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
545       }
546 
547       void
548       _M_make_cache(false_type)
549       { }
550 
551     private:
552       _GLIBCXX_STD_C::vector<_CharT>            _M_char_set;
553       _GLIBCXX_STD_C::vector<_StringT>          _M_equiv_set;
554       _GLIBCXX_STD_C::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
555       _GLIBCXX_STD_C::vector<_CharClassT>       _M_neg_class_set;
556       _CharClassT                               _M_class_set;
557       _TransT                                   _M_translator;
558       const _TraitsT&                           _M_traits;
559       bool                                      _M_is_non_matching;
560       _CacheT					_M_cache;
561 #ifdef _GLIBCXX_DEBUG
562       bool                                      _M_is_ready = false;
563 #endif
564     };
565 
566  ///@} regex-detail
567 } // namespace __detail
568 _GLIBCXX_END_NAMESPACE_VERSION
569 } // namespace std
570 
571 #include <bits/regex_compiler.tcc>
572