1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2010-2020 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.h 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 namespace std _GLIBCXX_VISIBILITY(default) 32 { 33 _GLIBCXX_BEGIN_NAMESPACE_VERSION 34 _GLIBCXX_BEGIN_NAMESPACE_CXX11 35 36 template<typename> 37 class regex_traits; 38 39 _GLIBCXX_END_NAMESPACE_CXX11 40 41 namespace __detail 42 { 43 /** 44 * @addtogroup regex-detail 45 * @{ 46 */ 47 48 template<typename, bool, bool> 49 struct _BracketMatcher; 50 51 /** 52 * @brief Builds an NFA from an input iterator range. 53 * 54 * The %_TraitsT type should fulfill requirements [28.3]. 55 */ 56 template<typename _TraitsT> 57 class _Compiler 58 { 59 public: 60 typedef typename _TraitsT::char_type _CharT; 61 typedef const _CharT* _IterT; 62 typedef _NFA<_TraitsT> _RegexT; 63 typedef regex_constants::syntax_option_type _FlagT; 64 65 _Compiler(_IterT __b, _IterT __e, 66 const typename _TraitsT::locale_type& __traits, _FlagT __flags); 67 68 shared_ptr<const _RegexT> 69 _M_get_nfa() noexcept 70 { return std::move(_M_nfa); } 71 72 private: 73 typedef _Scanner<_CharT> _ScannerT; 74 typedef typename _TraitsT::string_type _StringT; 75 typedef typename _ScannerT::_TokenT _TokenT; 76 typedef _StateSeq<_TraitsT> _StateSeqT; 77 typedef std::stack<_StateSeqT> _StackT; 78 typedef std::ctype<_CharT> _CtypeT; 79 80 // accepts a specific token or returns false. 81 bool 82 _M_match_token(_TokenT __token); 83 84 void 85 _M_disjunction(); 86 87 void 88 _M_alternative(); 89 90 bool 91 _M_term(); 92 93 bool 94 _M_assertion(); 95 96 bool 97 _M_quantifier(); 98 99 bool 100 _M_atom(); 101 102 bool 103 _M_bracket_expression(); 104 105 template<bool __icase, bool __collate> 106 void 107 _M_insert_any_matcher_ecma(); 108 109 template<bool __icase, bool __collate> 110 void 111 _M_insert_any_matcher_posix(); 112 113 template<bool __icase, bool __collate> 114 void 115 _M_insert_char_matcher(); 116 117 template<bool __icase, bool __collate> 118 void 119 _M_insert_character_class_matcher(); 120 121 template<bool __icase, bool __collate> 122 void 123 _M_insert_bracket_matcher(bool __neg); 124 125 // Cache of the last atom seen in a bracketed range expression. 126 struct _BracketState 127 { 128 enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None; 129 _CharT _M_char; 130 131 void 132 set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; } 133 134 _GLIBCXX_NODISCARD _CharT 135 get() const noexcept { return _M_char; } 136 137 void 138 reset(_Type __t = _Type::_None) noexcept { _M_type = __t; } 139 140 explicit operator bool() const noexcept 141 { return _M_type != _Type::_None; } 142 143 // Previous token was a single character. 144 _GLIBCXX_NODISCARD bool 145 _M_is_char() const noexcept { return _M_type == _Type::_Char; } 146 147 // Previous token was a character class, equivalent class, 148 // collating symbol etc. 149 _GLIBCXX_NODISCARD bool 150 _M_is_class() const noexcept { return _M_type == _Type::_Class; } 151 }; 152 153 template<bool __icase, bool __collate> 154 using _BracketMatcher 155 = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>; 156 157 // Returns true if successfully parsed one term and should continue 158 // compiling a bracket expression. 159 // Returns false if the compiler should move on. 160 template<bool __icase, bool __collate> 161 bool 162 _M_expression_term(_BracketState& __last_char, 163 _BracketMatcher<__icase, __collate>& __matcher); 164 165 int 166 _M_cur_int_value(int __radix); 167 168 bool 169 _M_try_char(); 170 171 _StateSeqT 172 _M_pop() 173 { 174 auto ret = _M_stack.top(); 175 _M_stack.pop(); 176 return ret; 177 } 178 179 static _FlagT 180 _S_validate(_FlagT __f) 181 { 182 using namespace regex_constants; 183 switch (__f & (ECMAScript|basic|extended|awk|grep|egrep)) 184 { 185 case ECMAScript: 186 case basic: 187 case extended: 188 case awk: 189 case grep: 190 case egrep: 191 return __f; 192 case _FlagT(0): 193 return __f | ECMAScript; 194 default: 195 std::__throw_regex_error(_S_grammar, "conflicting grammar options"); 196 } 197 } 198 199 _FlagT _M_flags; 200 _ScannerT _M_scanner; 201 shared_ptr<_RegexT> _M_nfa; 202 _StringT _M_value; 203 _StackT _M_stack; 204 const _TraitsT& _M_traits; 205 const _CtypeT& _M_ctype; 206 }; 207 208 template<typename _Tp> 209 struct __is_contiguous_iter : is_pointer<_Tp>::type { }; 210 211 template<typename _Tp, typename _Cont> 212 struct 213 __is_contiguous_iter<__gnu_cxx::__normal_iterator<_Tp*, _Cont>> 214 : true_type { }; 215 216 template<typename _Iter, typename _TraitsT> 217 using __enable_if_contiguous_iter 218 = __enable_if_t< __is_contiguous_iter<_Iter>::value, 219 std::shared_ptr<const _NFA<_TraitsT>> >; 220 221 template<typename _Iter, typename _TraitsT> 222 using __disable_if_contiguous_iter 223 = __enable_if_t< !__is_contiguous_iter<_Iter>::value, 224 std::shared_ptr<const _NFA<_TraitsT>> >; 225 226 template<typename _TraitsT, typename _FwdIter> 227 inline __enable_if_contiguous_iter<_FwdIter, _TraitsT> 228 __compile_nfa(_FwdIter __first, _FwdIter __last, 229 const typename _TraitsT::locale_type& __loc, 230 regex_constants::syntax_option_type __flags) 231 { 232 size_t __len = __last - __first; 233 const auto* __cfirst = __len ? std::__addressof(*__first) : nullptr; 234 using _Cmplr = _Compiler<_TraitsT>; 235 return _Cmplr(__cfirst, __cfirst + __len, __loc, __flags)._M_get_nfa(); 236 } 237 238 template<typename _TraitsT, typename _FwdIter> 239 inline __disable_if_contiguous_iter<_FwdIter, _TraitsT> 240 __compile_nfa(_FwdIter __first, _FwdIter __last, 241 const typename _TraitsT::locale_type& __loc, 242 regex_constants::syntax_option_type __flags) 243 { 244 const basic_string<typename _TraitsT::char_type> __str(__first, __last); 245 return __compile_nfa<_TraitsT>(__str.data(), __str.data() + __str.size(), 246 __loc, __flags); 247 } 248 249 // [28.13.14] 250 template<typename _TraitsT, bool __icase, bool __collate> 251 class _RegexTranslatorBase 252 { 253 public: 254 typedef typename _TraitsT::char_type _CharT; 255 typedef typename _TraitsT::string_type _StringT; 256 typedef _StringT _StrTransT; 257 258 explicit 259 _RegexTranslatorBase(const _TraitsT& __traits) 260 : _M_traits(__traits) 261 { } 262 263 _CharT 264 _M_translate(_CharT __ch) const 265 { 266 if (__icase) 267 return _M_traits.translate_nocase(__ch); 268 else if (__collate) 269 return _M_traits.translate(__ch); 270 else 271 return __ch; 272 } 273 274 _StrTransT 275 _M_transform(_CharT __ch) const 276 { 277 _StrTransT __str(1, __ch); 278 return _M_traits.transform(__str.begin(), __str.end()); 279 } 280 281 // See LWG 523. It's not efficiently implementable when _TraitsT is not 282 // std::regex_traits<>, and __collate is true. See specializations for 283 // implementations of other cases. 284 bool 285 _M_match_range(const _StrTransT& __first, const _StrTransT& __last, 286 const _StrTransT& __s) const 287 { return __first <= __s && __s <= __last; } 288 289 protected: 290 bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const 291 { 292 typedef std::ctype<_CharT> __ctype_type; 293 const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc()); 294 auto __lower = __fctyp.tolower(__ch); 295 auto __upper = __fctyp.toupper(__ch); 296 return (__first <= __lower && __lower <= __last) 297 || (__first <= __upper && __upper <= __last); 298 } 299 300 const _TraitsT& _M_traits; 301 }; 302 303 template<typename _TraitsT, bool __icase, bool __collate> 304 class _RegexTranslator 305 : public _RegexTranslatorBase<_TraitsT, __icase, __collate> 306 { 307 public: 308 typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base; 309 using _Base::_Base; 310 }; 311 312 template<typename _TraitsT, bool __icase> 313 class _RegexTranslator<_TraitsT, __icase, false> 314 : public _RegexTranslatorBase<_TraitsT, __icase, false> 315 { 316 public: 317 typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base; 318 typedef typename _Base::_CharT _CharT; 319 typedef _CharT _StrTransT; 320 321 using _Base::_Base; 322 323 _StrTransT 324 _M_transform(_CharT __ch) const 325 { return __ch; } 326 327 bool 328 _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const 329 { 330 if (!__icase) 331 return __first <= __ch && __ch <= __last; 332 return this->_M_in_range_icase(__first, __last, __ch); 333 } 334 }; 335 336 template<typename _CharType> 337 class _RegexTranslator<std::regex_traits<_CharType>, true, true> 338 : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true> 339 { 340 public: 341 typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true> 342 _Base; 343 typedef typename _Base::_CharT _CharT; 344 typedef typename _Base::_StrTransT _StrTransT; 345 346 using _Base::_Base; 347 348 bool 349 _M_match_range(const _StrTransT& __first, const _StrTransT& __last, 350 const _StrTransT& __str) const 351 { 352 __glibcxx_assert(__first.size() == 1); 353 __glibcxx_assert(__last.size() == 1); 354 __glibcxx_assert(__str.size() == 1); 355 return this->_M_in_range_icase(__first[0], __last[0], __str[0]); 356 } 357 }; 358 359 template<typename _TraitsT> 360 class _RegexTranslator<_TraitsT, false, false> 361 { 362 public: 363 typedef typename _TraitsT::char_type _CharT; 364 typedef _CharT _StrTransT; 365 366 explicit 367 _RegexTranslator(const _TraitsT&) 368 { } 369 370 _CharT 371 _M_translate(_CharT __ch) const 372 { return __ch; } 373 374 _StrTransT 375 _M_transform(_CharT __ch) const 376 { return __ch; } 377 378 bool 379 _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const 380 { return __first <= __ch && __ch <= __last; } 381 }; 382 383 template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate> 384 struct _AnyMatcher; 385 386 template<typename _TraitsT, bool __icase, bool __collate> 387 struct _AnyMatcher<_TraitsT, false, __icase, __collate> 388 { 389 typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; 390 typedef typename _TransT::_CharT _CharT; 391 392 explicit 393 _AnyMatcher(const _TraitsT& __traits) 394 : _M_translator(__traits) 395 { } 396 397 bool 398 operator()(_CharT __ch) const 399 { 400 static auto __nul = _M_translator._M_translate('\0'); 401 return _M_translator._M_translate(__ch) != __nul; 402 } 403 404 _TransT _M_translator; 405 }; 406 407 template<typename _TraitsT, bool __icase, bool __collate> 408 struct _AnyMatcher<_TraitsT, true, __icase, __collate> 409 { 410 typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; 411 typedef typename _TransT::_CharT _CharT; 412 413 explicit 414 _AnyMatcher(const _TraitsT& __traits) 415 : _M_translator(__traits) 416 { } 417 418 bool 419 operator()(_CharT __ch) const 420 { return _M_apply(__ch, typename is_same<_CharT, char>::type()); } 421 422 bool 423 _M_apply(_CharT __ch, true_type) const 424 { 425 auto __c = _M_translator._M_translate(__ch); 426 auto __n = _M_translator._M_translate('\n'); 427 auto __r = _M_translator._M_translate('\r'); 428 return __c != __n && __c != __r; 429 } 430 431 bool 432 _M_apply(_CharT __ch, false_type) const 433 { 434 auto __c = _M_translator._M_translate(__ch); 435 auto __n = _M_translator._M_translate('\n'); 436 auto __r = _M_translator._M_translate('\r'); 437 auto __u2028 = _M_translator._M_translate(u'\u2028'); 438 auto __u2029 = _M_translator._M_translate(u'\u2029'); 439 return __c != __n && __c != __r && __c != __u2028 && __c != __u2029; 440 } 441 442 _TransT _M_translator; 443 }; 444 445 template<typename _TraitsT, bool __icase, bool __collate> 446 struct _CharMatcher 447 { 448 typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; 449 typedef typename _TransT::_CharT _CharT; 450 451 _CharMatcher(_CharT __ch, const _TraitsT& __traits) 452 : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch)) 453 { } 454 455 bool 456 operator()(_CharT __ch) const 457 { return _M_ch == _M_translator._M_translate(__ch); } 458 459 _TransT _M_translator; 460 _CharT _M_ch; 461 }; 462 463 /// Matches a character range (bracket expression) 464 template<typename _TraitsT, bool __icase, bool __collate> 465 struct _BracketMatcher 466 { 467 public: 468 typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; 469 typedef typename _TransT::_CharT _CharT; 470 typedef typename _TransT::_StrTransT _StrTransT; 471 typedef typename _TraitsT::string_type _StringT; 472 typedef typename _TraitsT::char_class_type _CharClassT; 473 474 public: 475 _BracketMatcher(bool __is_non_matching, 476 const _TraitsT& __traits) 477 : _M_class_set(0), _M_translator(__traits), _M_traits(__traits), 478 _M_is_non_matching(__is_non_matching) 479 { } 480 481 bool 482 operator()(_CharT __ch) const 483 { 484 _GLIBCXX_DEBUG_ASSERT(_M_is_ready); 485 return _M_apply(__ch, _UseCache()); 486 } 487 488 void 489 _M_add_char(_CharT __c) 490 { 491 _M_char_set.push_back(_M_translator._M_translate(__c)); 492 _GLIBCXX_DEBUG_ONLY(_M_is_ready = false); 493 } 494 495 _StringT 496 _M_add_collate_element(const _StringT& __s) 497 { 498 auto __st = _M_traits.lookup_collatename(__s.data(), 499 __s.data() + __s.size()); 500 if (__st.empty()) 501 __throw_regex_error(regex_constants::error_collate, 502 "Invalid collate element."); 503 _M_char_set.push_back(_M_translator._M_translate(__st[0])); 504 _GLIBCXX_DEBUG_ONLY(_M_is_ready = false); 505 return __st; 506 } 507 508 void 509 _M_add_equivalence_class(const _StringT& __s) 510 { 511 auto __st = _M_traits.lookup_collatename(__s.data(), 512 __s.data() + __s.size()); 513 if (__st.empty()) 514 __throw_regex_error(regex_constants::error_collate, 515 "Invalid equivalence class."); 516 __st = _M_traits.transform_primary(__st.data(), 517 __st.data() + __st.size()); 518 _M_equiv_set.push_back(__st); 519 _GLIBCXX_DEBUG_ONLY(_M_is_ready = false); 520 } 521 522 // __neg should be true for \D, \S and \W only. 523 void 524 _M_add_character_class(const _StringT& __s, bool __neg) 525 { 526 auto __mask = _M_traits.lookup_classname(__s.data(), 527 __s.data() + __s.size(), 528 __icase); 529 if (__mask == 0) 530 __throw_regex_error(regex_constants::error_collate, 531 "Invalid character class."); 532 if (!__neg) 533 _M_class_set |= __mask; 534 else 535 _M_neg_class_set.push_back(__mask); 536 _GLIBCXX_DEBUG_ONLY(_M_is_ready = false); 537 } 538 539 void 540 _M_make_range(_CharT __l, _CharT __r) 541 { 542 if (__l > __r) 543 __throw_regex_error(regex_constants::error_range, 544 "Invalid range in bracket expression."); 545 _M_range_set.push_back(make_pair(_M_translator._M_transform(__l), 546 _M_translator._M_transform(__r))); 547 _GLIBCXX_DEBUG_ONLY(_M_is_ready = false); 548 } 549 550 void 551 _M_ready() 552 { 553 std::sort(_M_char_set.begin(), _M_char_set.end()); 554 auto __end = std::unique(_M_char_set.begin(), _M_char_set.end()); 555 _M_char_set.erase(__end, _M_char_set.end()); 556 _M_make_cache(_UseCache()); 557 _GLIBCXX_DEBUG_ONLY(_M_is_ready = true); 558 } 559 560 private: 561 // Currently we only use the cache for char 562 typedef typename std::is_same<_CharT, char>::type _UseCache; 563 564 static constexpr size_t 565 _S_cache_size = 566 1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value)); 567 568 struct _Dummy { }; 569 typedef typename std::conditional<_UseCache::value, 570 std::bitset<_S_cache_size>, 571 _Dummy>::type _CacheT; 572 typedef typename std::make_unsigned<_CharT>::type _UnsignedCharT; 573 574 bool 575 _M_apply(_CharT __ch, false_type) const; 576 577 bool 578 _M_apply(_CharT __ch, true_type) const 579 { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; } 580 581 void 582 _M_make_cache(true_type) 583 { 584 for (unsigned __i = 0; __i < _M_cache.size(); __i++) 585 _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type()); 586 } 587 588 void 589 _M_make_cache(false_type) 590 { } 591 592 private: 593 std::vector<_CharT> _M_char_set; 594 std::vector<_StringT> _M_equiv_set; 595 std::vector<pair<_StrTransT, _StrTransT>> _M_range_set; 596 std::vector<_CharClassT> _M_neg_class_set; 597 _CharClassT _M_class_set; 598 _TransT _M_translator; 599 const _TraitsT& _M_traits; 600 bool _M_is_non_matching; 601 _CacheT _M_cache; 602 #ifdef _GLIBCXX_DEBUG 603 bool _M_is_ready = false; 604 #endif 605 }; 606 607 ///@} regex-detail 608 } // namespace __detail 609 _GLIBCXX_END_NAMESPACE_VERSION 610 } // namespace std 611 612 #include <bits/regex_compiler.tcc> 613