1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2015 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 namespace __detail 52 { 53 _GLIBCXX_BEGIN_NAMESPACE_VERSION 54 55 template<typename _CharT> 56 _Scanner<_CharT>:: 57 _Scanner(typename _Scanner::_IterT __begin, 58 typename _Scanner::_IterT __end, 59 _FlagT __flags, std::locale __loc) 60 : _ScannerBase(__flags), 61 _M_current(__begin), _M_end(__end), 62 _M_ctype(std::use_facet<_CtypeT>(__loc)), 63 _M_eat_escape(_M_is_ecma() 64 ? &_Scanner::_M_eat_escape_ecma 65 : &_Scanner::_M_eat_escape_posix) 66 { _M_advance(); } 67 68 template<typename _CharT> 69 void 70 _Scanner<_CharT>:: 71 _M_advance() 72 { 73 if (_M_current == _M_end) 74 { 75 _M_token = _S_token_eof; 76 return; 77 } 78 79 if (_M_state == _S_state_normal) 80 _M_scan_normal(); 81 else if (_M_state == _S_state_in_bracket) 82 _M_scan_in_bracket(); 83 else if (_M_state == _S_state_in_brace) 84 _M_scan_in_brace(); 85 else 86 { 87 _GLIBCXX_DEBUG_ASSERT(false); 88 } 89 } 90 91 // Differences between styles: 92 // 1) "\(", "\)", "\{" in basic. It's not escaping. 93 // 2) "(?:", "(?=", "(?!" in ECMAScript. 94 template<typename _CharT> 95 void 96 _Scanner<_CharT>:: 97 _M_scan_normal() 98 { 99 auto __c = *_M_current++; 100 const char* __pos; 101 102 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr) 103 { 104 _M_token = _S_token_ord_char; 105 _M_value.assign(1, __c); 106 return; 107 } 108 if (__c == '\\') 109 { 110 if (_M_current == _M_end) 111 __throw_regex_error(regex_constants::error_escape); 112 113 if (!_M_is_basic() 114 || (*_M_current != '(' 115 && *_M_current != ')' 116 && *_M_current != '{')) 117 { 118 (this->*_M_eat_escape)(); 119 return; 120 } 121 __c = *_M_current++; 122 } 123 if (__c == '(') 124 { 125 if (_M_is_ecma() && *_M_current == '?') 126 { 127 if (++_M_current == _M_end) 128 __throw_regex_error(regex_constants::error_paren); 129 130 if (*_M_current == ':') 131 { 132 ++_M_current; 133 _M_token = _S_token_subexpr_no_group_begin; 134 } 135 else if (*_M_current == '=') 136 { 137 ++_M_current; 138 _M_token = _S_token_subexpr_lookahead_begin; 139 _M_value.assign(1, 'p'); 140 } 141 else if (*_M_current == '!') 142 { 143 ++_M_current; 144 _M_token = _S_token_subexpr_lookahead_begin; 145 _M_value.assign(1, 'n'); 146 } 147 else 148 __throw_regex_error(regex_constants::error_paren); 149 } 150 else if (_M_flags & regex_constants::nosubs) 151 _M_token = _S_token_subexpr_no_group_begin; 152 else 153 _M_token = _S_token_subexpr_begin; 154 } 155 else if (__c == ')') 156 _M_token = _S_token_subexpr_end; 157 else if (__c == '[') 158 { 159 _M_state = _S_state_in_bracket; 160 _M_at_bracket_start = true; 161 if (_M_current != _M_end && *_M_current == '^') 162 { 163 _M_token = _S_token_bracket_neg_begin; 164 ++_M_current; 165 } 166 else 167 _M_token = _S_token_bracket_begin; 168 } 169 else if (__c == '{') 170 { 171 _M_state = _S_state_in_brace; 172 _M_token = _S_token_interval_begin; 173 } 174 else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'))) 175 != nullptr 176 && *__pos != '\0' 177 && __c != ']' 178 && __c != '}') 179 || (_M_is_grep() && __c == '\n')) 180 { 181 auto __it = _M_token_tbl; 182 auto __narrowc = _M_ctype.narrow(__c, '\0'); 183 for (; __it->first != '\0'; ++__it) 184 if (__it->first == __narrowc) 185 { 186 _M_token = __it->second; 187 return; 188 } 189 _GLIBCXX_DEBUG_ASSERT(false); 190 } 191 else 192 { 193 _M_token = _S_token_ord_char; 194 _M_value.assign(1, __c); 195 } 196 } 197 198 // Differences between styles: 199 // 1) different semantics of "[]" and "[^]". 200 // 2) Escaping in bracket expr. 201 template<typename _CharT> 202 void 203 _Scanner<_CharT>:: 204 _M_scan_in_bracket() 205 { 206 if (_M_current == _M_end) 207 __throw_regex_error(regex_constants::error_brack); 208 209 auto __c = *_M_current++; 210 211 if (__c == '[') 212 { 213 if (_M_current == _M_end) 214 __throw_regex_error(regex_constants::error_brack); 215 216 if (*_M_current == '.') 217 { 218 _M_token = _S_token_collsymbol; 219 _M_eat_class(*_M_current++); 220 } 221 else if (*_M_current == ':') 222 { 223 _M_token = _S_token_char_class_name; 224 _M_eat_class(*_M_current++); 225 } 226 else if (*_M_current == '=') 227 { 228 _M_token = _S_token_equiv_class_name; 229 _M_eat_class(*_M_current++); 230 } 231 else 232 { 233 _M_token = _S_token_ord_char; 234 _M_value.assign(1, __c); 235 } 236 } 237 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 238 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 239 // `*/empty_range.cc`. 240 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 241 { 242 _M_token = _S_token_bracket_end; 243 _M_state = _S_state_normal; 244 } 245 // ECMAScript and awk permits escaping in bracket. 246 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 247 (this->*_M_eat_escape)(); 248 else 249 { 250 _M_token = _S_token_ord_char; 251 _M_value.assign(1, __c); 252 } 253 _M_at_bracket_start = false; 254 } 255 256 // Differences between styles: 257 // 1) "\}" in basic style. 258 template<typename _CharT> 259 void 260 _Scanner<_CharT>:: 261 _M_scan_in_brace() 262 { 263 if (_M_current == _M_end) 264 __throw_regex_error(regex_constants::error_brace); 265 266 auto __c = *_M_current++; 267 268 if (_M_ctype.is(_CtypeT::digit, __c)) 269 { 270 _M_token = _S_token_dup_count; 271 _M_value.assign(1, __c); 272 while (_M_current != _M_end 273 && _M_ctype.is(_CtypeT::digit, *_M_current)) 274 _M_value += *_M_current++; 275 } 276 else if (__c == ',') 277 _M_token = _S_token_comma; 278 // basic use \}. 279 else if (_M_is_basic()) 280 { 281 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 282 { 283 _M_state = _S_state_normal; 284 _M_token = _S_token_interval_end; 285 ++_M_current; 286 } 287 else 288 __throw_regex_error(regex_constants::error_badbrace); 289 } 290 else if (__c == '}') 291 { 292 _M_state = _S_state_normal; 293 _M_token = _S_token_interval_end; 294 } 295 else 296 __throw_regex_error(regex_constants::error_badbrace); 297 } 298 299 template<typename _CharT> 300 void 301 _Scanner<_CharT>:: 302 _M_eat_escape_ecma() 303 { 304 if (_M_current == _M_end) 305 __throw_regex_error(regex_constants::error_escape); 306 307 auto __c = *_M_current++; 308 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 309 310 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 311 { 312 _M_token = _S_token_ord_char; 313 _M_value.assign(1, *__pos); 314 } 315 else if (__c == 'b') 316 { 317 _M_token = _S_token_word_bound; 318 _M_value.assign(1, 'p'); 319 } 320 else if (__c == 'B') 321 { 322 _M_token = _S_token_word_bound; 323 _M_value.assign(1, 'n'); 324 } 325 // N3376 28.13 326 else if (__c == 'd' 327 || __c == 'D' 328 || __c == 's' 329 || __c == 'S' 330 || __c == 'w' 331 || __c == 'W') 332 { 333 _M_token = _S_token_quoted_class; 334 _M_value.assign(1, __c); 335 } 336 else if (__c == 'c') 337 { 338 if (_M_current == _M_end) 339 __throw_regex_error(regex_constants::error_escape); 340 _M_token = _S_token_ord_char; 341 _M_value.assign(1, *_M_current++); 342 } 343 else if (__c == 'x' || __c == 'u') 344 { 345 _M_value.erase(); 346 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 347 { 348 if (_M_current == _M_end 349 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 350 __throw_regex_error(regex_constants::error_escape); 351 _M_value += *_M_current++; 352 } 353 _M_token = _S_token_hex_num; 354 } 355 // ECMAScript recognizes multi-digit back-references. 356 else if (_M_ctype.is(_CtypeT::digit, __c)) 357 { 358 _M_value.assign(1, __c); 359 while (_M_current != _M_end 360 && _M_ctype.is(_CtypeT::digit, *_M_current)) 361 _M_value += *_M_current++; 362 _M_token = _S_token_backref; 363 } 364 else 365 { 366 _M_token = _S_token_ord_char; 367 _M_value.assign(1, __c); 368 } 369 } 370 371 // Differences between styles: 372 // 1) Extended doesn't support backref, but basic does. 373 template<typename _CharT> 374 void 375 _Scanner<_CharT>:: 376 _M_eat_escape_posix() 377 { 378 if (_M_current == _M_end) 379 __throw_regex_error(regex_constants::error_escape); 380 381 auto __c = *_M_current; 382 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 383 384 if (__pos != nullptr && *__pos != '\0') 385 { 386 _M_token = _S_token_ord_char; 387 _M_value.assign(1, __c); 388 } 389 // We MUST judge awk before handling backrefs. There's no backref in awk. 390 else if (_M_is_awk()) 391 { 392 _M_eat_escape_awk(); 393 return; 394 } 395 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 396 { 397 _M_token = _S_token_backref; 398 _M_value.assign(1, __c); 399 } 400 else 401 { 402 #ifdef __STRICT_ANSI__ 403 // POSIX says it is undefined to escape ordinary characters 404 __throw_regex_error(regex_constants::error_escape); 405 #else 406 _M_token = _S_token_ord_char; 407 _M_value.assign(1, __c); 408 #endif 409 } 410 ++_M_current; 411 } 412 413 template<typename _CharT> 414 void 415 _Scanner<_CharT>:: 416 _M_eat_escape_awk() 417 { 418 auto __c = *_M_current++; 419 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 420 421 if (__pos != nullptr) 422 { 423 _M_token = _S_token_ord_char; 424 _M_value.assign(1, *__pos); 425 } 426 // \ddd for oct representation 427 else if (_M_ctype.is(_CtypeT::digit, __c) 428 && __c != '8' 429 && __c != '9') 430 { 431 _M_value.assign(1, __c); 432 for (int __i = 0; 433 __i < 2 434 && _M_current != _M_end 435 && _M_ctype.is(_CtypeT::digit, *_M_current) 436 && *_M_current != '8' 437 && *_M_current != '9'; 438 __i++) 439 _M_value += *_M_current++; 440 _M_token = _S_token_oct_num; 441 return; 442 } 443 else 444 __throw_regex_error(regex_constants::error_escape); 445 } 446 447 // Eats a character class or throws an exception. 448 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 449 // returning. 450 template<typename _CharT> 451 void 452 _Scanner<_CharT>:: 453 _M_eat_class(char __ch) 454 { 455 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 456 _M_value += *_M_current++; 457 if (_M_current == _M_end 458 || *_M_current++ != __ch 459 || _M_current == _M_end // skip __ch 460 || *_M_current++ != ']') // skip ']' 461 { 462 if (__ch == ':') 463 __throw_regex_error(regex_constants::error_ctype); 464 else 465 __throw_regex_error(regex_constants::error_collate); 466 } 467 } 468 469 #ifdef _GLIBCXX_DEBUG 470 template<typename _CharT> 471 std::ostream& 472 _Scanner<_CharT>:: 473 _M_print(std::ostream& ostr) 474 { 475 switch (_M_token) 476 { 477 case _S_token_anychar: 478 ostr << "any-character\n"; 479 break; 480 case _S_token_backref: 481 ostr << "backref\n"; 482 break; 483 case _S_token_bracket_begin: 484 ostr << "bracket-begin\n"; 485 break; 486 case _S_token_bracket_neg_begin: 487 ostr << "bracket-neg-begin\n"; 488 break; 489 case _S_token_bracket_end: 490 ostr << "bracket-end\n"; 491 break; 492 case _S_token_char_class_name: 493 ostr << "char-class-name \"" << _M_value << "\"\n"; 494 break; 495 case _S_token_closure0: 496 ostr << "closure0\n"; 497 break; 498 case _S_token_closure1: 499 ostr << "closure1\n"; 500 break; 501 case _S_token_collsymbol: 502 ostr << "collsymbol \"" << _M_value << "\"\n"; 503 break; 504 case _S_token_comma: 505 ostr << "comma\n"; 506 break; 507 case _S_token_dup_count: 508 ostr << "dup count: " << _M_value << "\n"; 509 break; 510 case _S_token_eof: 511 ostr << "EOF\n"; 512 break; 513 case _S_token_equiv_class_name: 514 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 515 break; 516 case _S_token_interval_begin: 517 ostr << "interval begin\n"; 518 break; 519 case _S_token_interval_end: 520 ostr << "interval end\n"; 521 break; 522 case _S_token_line_begin: 523 ostr << "line begin\n"; 524 break; 525 case _S_token_line_end: 526 ostr << "line end\n"; 527 break; 528 case _S_token_opt: 529 ostr << "opt\n"; 530 break; 531 case _S_token_or: 532 ostr << "or\n"; 533 break; 534 case _S_token_ord_char: 535 ostr << "ordinary character: \"" << _M_value << "\"\n"; 536 break; 537 case _S_token_subexpr_begin: 538 ostr << "subexpr begin\n"; 539 break; 540 case _S_token_subexpr_no_group_begin: 541 ostr << "no grouping subexpr begin\n"; 542 break; 543 case _S_token_subexpr_lookahead_begin: 544 ostr << "lookahead subexpr begin\n"; 545 break; 546 case _S_token_subexpr_end: 547 ostr << "subexpr end\n"; 548 break; 549 case _S_token_unknown: 550 ostr << "-- unknown token --\n"; 551 break; 552 case _S_token_oct_num: 553 ostr << "oct number " << _M_value << "\n"; 554 break; 555 case _S_token_hex_num: 556 ostr << "hex number " << _M_value << "\n"; 557 break; 558 case _S_token_quoted_class: 559 ostr << "quoted class " << "\\" << _M_value << "\n"; 560 break; 561 default: 562 _GLIBCXX_DEBUG_ASSERT(false); 563 } 564 return ostr; 565 } 566 #endif 567 568 _GLIBCXX_END_NAMESPACE_VERSION 569 } // namespace __detail 570 } // namespace 571