1 /++ 2 $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions) 3 are a commonly used method of pattern matching 4 on strings, with $(I regex) being a catchy word for a pattern in this domain 5 specific language. Typical problems usually solved by regular expressions 6 include validation of user input and the ubiquitous find $(AMP) replace 7 in text processing utilities. 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(BOOKTABLE, 11 $(TR $(TH Category) $(TH Functions)) 12 $(TR $(TD Matching) $(TD 13 $(LREF bmatch) 14 $(LREF match) 15 $(LREF matchAll) 16 $(LREF matchFirst) 17 )) 18 $(TR $(TD Building) $(TD 19 $(LREF ctRegex) 20 $(LREF escaper) 21 $(LREF _regex) 22 )) 23 $(TR $(TD Replace) $(TD 24 $(LREF replace) 25 $(LREF replaceAll) 26 $(LREF replaceAllInto) 27 $(LREF replaceFirst) 28 $(LREF replaceFirstInto) 29 )) 30 $(TR $(TD Split) $(TD 31 $(LREF split) 32 $(LREF splitter) 33 )) 34 $(TR $(TD Objects) $(TD 35 $(LREF Captures) 36 $(LREF Regex) 37 $(LREF RegexException) 38 $(LREF RegexMatch) 39 $(LREF Splitter) 40 $(LREF StaticRegex) 41 )) 42 ) 43 44 $(SECTION Synopsis) 45 --- 46 import std.regex; 47 import std.stdio; 48 void main() 49 { 50 // Print out all possible dd/mm/yy(yy) dates found in user input. 51 auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b"); 52 foreach (line; stdin.byLine) 53 { 54 // matchAll() returns a range that can be iterated 55 // to get all subsequent matches. 56 foreach (c; matchAll(line, r)) 57 writeln(c.hit); 58 } 59 } 60 ... 61 62 // Create a static regex at compile-time, which contains fast native code. 63 auto ctr = ctRegex!(`^.*/([^/]+)/?$`); 64 65 // It works just like a normal regex: 66 auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any 67 assert(!c2.empty); // Be sure to check if there is a match before examining contents! 68 assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match. 69 70 ... 
// multi-pattern regex
auto multi = regex([`(\d+),\d+`,`([a-z]+):(\d+)`]);
auto m = "abc:43 12,34".matchAll(multi);
assert(m.front.whichPattern == 2);
assert(m.front[1] == "abc");
assert(m.front[2] == "43");
m.popFront();
assert(m.front.whichPattern == 1);
assert(m.front[1] == "12");
...

// The result of the `matchAll/matchFirst` is directly testable with if/assert/while.
// e.g. test if a string consists of letters:
assert(matchFirst("Letter", `^\p{L}+$`));
---

$(SECTION Syntax and general information)
The general usage guideline is to keep regex complexity on the side of simplicity,
as its capabilities reside in purely character-level manipulation.
As such it's ill-suited for tasks involving higher level invariants
like matching an integer number $(U bounded) in an [a,b] interval.
Checks of this sort are better addressed by additional post-processing.

The basic syntax shouldn't surprise experienced users of regular expressions.
For an introduction to $(D std.regex) see a
$(HTTP dlang.org/regular-expression.html, short tour) of the module API
and its abilities.

There are other web resources on regular expressions to help newcomers,
and a good $(HTTP www.regular-expressions.info, reference with tutorial)
can easily be found.

This library uses a remarkably common ECMAScript syntax flavor
with the following extensions:
$(UL
  $(LI Named subexpressions, with Python syntax. )
  $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.)
  $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.)
)

$(REG_START Pattern syntax )
$(I std.regex operates on codepoint level,
  'character' in this table denotes a single Unicode codepoint.)
114 $(REG_TABLE 115 $(REG_TITLE Pattern element, Semantics ) 116 $(REG_TITLE Atoms, Match single characters ) 117 $(REG_ROW any character except [{|*+?()^$, Matches the character itself. ) 118 $(REG_ROW ., In single line mode matches any character. 119 Otherwise it matches any character except '\n' and '\r'. ) 120 $(REG_ROW [class], Matches a single character 121 that belongs to this character class. ) 122 $(REG_ROW [^class], Matches a single character that 123 does $(U not) belong to this character class.) 124 $(REG_ROW \cC, Matches the control character corresponding to letter C) 125 $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. ) 126 $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. ) 127 $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. ) 128 $(REG_ROW \f, Matches a formfeed character. ) 129 $(REG_ROW \n, Matches a linefeed character. ) 130 $(REG_ROW \r, Matches a carriage return character. ) 131 $(REG_ROW \t, Matches a tab character. ) 132 $(REG_ROW \v, Matches a vertical tab character. ) 133 $(REG_ROW \d, Matches any Unicode digit. ) 134 $(REG_ROW \D, Matches any character except Unicode digits. ) 135 $(REG_ROW \w, Matches any word character (note: this includes numbers).) 136 $(REG_ROW \W, Matches any non-word character.) 137 $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.) 138 $(REG_ROW \S, Matches any character except those recognized as $(I \s ). ) 139 $(REG_ROW \\, Matches \ character. ) 140 $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. ) 141 $(REG_ROW \p{PropertyName}, Matches a character that belongs 142 to the Unicode PropertyName set. 143 Single letter abbreviations can be used without surrounding {,}. ) 144 $(REG_ROW \P{PropertyName}, Matches a character that does not belong 145 to the Unicode PropertyName set. 146 Single letter abbreviations can be used without surrounding {,}. 
)
$(REG_ROW \p{InBasicLatin}, Matches any character that is part of
    the BasicLatin Unicode $(U block).)
$(REG_ROW \P{InBasicLatin}, Matches any character except ones in
    the BasicLatin Unicode $(U block).)
$(REG_ROW \p{Cyrillic}, Matches any character that is part of
    Cyrillic $(U script).)
$(REG_ROW \P{Cyrillic}, Matches any character except ones in
    Cyrillic $(U script).)
$(REG_TITLE Quantifiers, Specify repetition of other elements)
$(REG_ROW *, Matches previous character/subexpression 0 or more times.
    Greedy version - tries as many times as possible.)
$(REG_ROW *?, Matches previous character/subexpression 0 or more times.
    Lazy version - stops as early as possible.)
$(REG_ROW +, Matches previous character/subexpression 1 or more times.
    Greedy version - tries as many times as possible.)
$(REG_ROW +?, Matches previous character/subexpression 1 or more times.
    Lazy version - stops as early as possible.)
$(REG_ROW {n}, Matches previous character/subexpression exactly n times. )
$(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more.
    Greedy version - tries as many times as possible. )
$(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more.
    Lazy version - stops as early as possible.)
$(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times.
    Greedy version - tries as many times as possible, but no more than m times. )
$(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times.
    Lazy version - stops as early as possible, but no less than n times.)
$(REG_TITLE Other, Subexpressions $(AMP) alternations )
$(REG_ROW (regex), Matches subexpression regex,
    saving matched portion of text for later retrieval. )
$(REG_ROW (?#comment), An inline comment that is ignored while matching.)
$(REG_ROW (?:regex), Matches subexpression regex,
    $(U not) saving matched portion of text. Useful to speed up matching. )
$(REG_ROW A|B, Matches subexpression A, or failing that, matches B. )
$(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression
    regex labeling it with name 'name'.
    When referring to a matched portion of text,
    names work like aliases in addition to direct numbers.
    )
$(REG_TITLE Assertions, Match position rather than character )
$(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).)
$(REG_ROW $, Matches at the end of input or line (in multiline mode). )
$(REG_ROW \b, Matches at word boundary. )
$(REG_ROW \B, Matches when $(U not) at word boundary. )
$(REG_ROW (?=regex), Zero-width lookahead assertion.
    Matches at a point where the subexpression
    regex could be matched starting from the current position.
    )
$(REG_ROW (?!regex), Zero-width negative lookahead assertion.
    Matches at a point where the subexpression
    regex could $(U not) be matched starting from the current position.
    )
$(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point
    where the subexpression regex could be matched ending
    at the current position (matching goes backwards).
    )
$(REG_ROW (?<!regex), Zero-width negative lookbehind assertion.
    Matches at a point where the subexpression regex could $(U not)
    be matched ending at the current position (matching goes backwards).
    )
)

$(REG_START Character classes )
$(REG_TABLE
$(REG_TITLE Pattern element, Semantics )
$(REG_ROW Any atom, Has the same meaning as outside of a character class.)
$(REG_ROW a-z, Includes characters a, b, c, ..., z. )
$(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b],
    Where a, b are arbitrary classes, means union, set difference,
    symmetric set difference, and intersection respectively.
216 $(I Any sequence of character class elements implicitly forms a union.) ) 217 ) 218 219 $(REG_START Regex flags ) 220 $(REG_TABLE 221 $(REG_TITLE Flag, Semantics ) 222 $(REG_ROW g, Global regex, repeat over the whole input. ) 223 $(REG_ROW i, Case insensitive matching. ) 224 $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators 225 as well as start and end of input.) 226 $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. ) 227 $(REG_ROW x, Free-form syntax, ignores whitespace in pattern, 228 useful for formatting complex regular expressions. ) 229 ) 230 231 $(SECTION Unicode support) 232 233 This library provides full Level 1 support* according to 234 $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically: 235 $(UL 236 $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.) 237 $(LI 1.2 Unicode properties.) 238 $(LI 1.3 Character classes with set operations.) 239 $(LI 1.4 Word boundaries use the full set of "word" characters.) 240 $(LI 1.5 Using simple casefolding to match case 241 insensitively across the full range of codepoints.) 242 $(LI 1.6 Respecting line breaks as any of 243 \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.) 244 $(LI 1.7 Operating on codepoint level.) 245 ) 246 *With exception of point 1.1.1, as of yet, normalization of input 247 is expected to be enforced by user. 248 249 $(SECTION Replace format string) 250 251 A set of functions in this module that do the substitution rely 252 on a simple format to guide the process. In particular the table below 253 applies to the $(D format) argument of 254 $(LREF replaceFirst) and $(LREF replaceAll). 255 256 The format string can reference parts of match using the following notation. 257 $(REG_TABLE 258 $(REG_TITLE Format specifier, Replaced by ) 259 $(REG_ROW $$(AMP), the whole match. ) 260 $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. ) 261 $(REG_ROW $', part of input $(I following) the match. 
) 262 $(REG_ROW $$, '$' character. ) 263 $(REG_ROW \c $(COMMA) where c is any character, the character c itself. ) 264 $(REG_ROW \\, '\' character. ) 265 $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. ) 266 ) 267 268 $(SECTION Slicing and zero memory allocations orientation) 269 270 All matches returned by pattern matching functionality in this library 271 are slices of the original input. The notable exception is the $(D replace) 272 family of functions that generate a new string from the input. 273 274 In cases where producing the replacement is the ultimate goal 275 $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy 276 as functions that avoid allocations even for replacement. 277 278 Copyright: Copyright Dmitry Olshansky, 2011- 279 280 License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). 281 282 Authors: Dmitry Olshansky, 283 284 API and utility constructs are modeled after the original $(D std.regex) 285 by Walter Bright and Andrei Alexandrescu. 286 287 Source: $(PHOBOSSRC std/_regex/_package.d) 288 289 Macros: 290 REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) ) 291 REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) ) 292 REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table> 293 REG_START = <h3><div align="center"> $0 </div></h3> 294 SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3> 295 S_LINK = <a href="#$1">$+</a> 296 +/ 297 module std.regex; 298 299 import std.range.primitives, std.traits; 300 import std.regex.internal.ir; 301 import std.regex.internal.thompson; //TODO: get rid of this dependency 302 import std.typecons; // : Flag, Yes, No; 303 304 /++ 305 $(D Regex) object holds regular expression pattern in compiled form. 306 307 Instances of this object are constructed via calls to $(D regex). 308 This is an intended form for caching and storage of frequently 309 used regular expressions. 310 311 Example: 312 313 Test if this object doesn't contain any compiled pattern. 
---
Regex!char r;
assert(r.empty);
r = regex(""); // Note: "" is a valid regex pattern.
assert(!r.empty);
---

Getting a range of all the named captures in the regex.
----
import std.range;
import std.algorithm;

auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
auto nc = re.namedCaptures;
static assert(isRandomAccessRange!(typeof(nc)));
assert(!nc.empty);
assert(nc.length == 2);
assert(nc.equal(["name", "var"]));
assert(nc[0] == "name");
assert(nc[1..$].equal(["var"]));
----
+/
public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);

/++
    A $(D StaticRegex) is $(D Regex) object that contains D code specially
    generated at compile-time to speed up matching.

    Implicitly convertible to normal $(D Regex),
    however doing so will result in losing this additional capability.
+/
public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char);

/++
    Compile regular expression pattern for the later execution.
    Returns: $(D Regex) object that works on inputs having
    the same character width as $(D pattern).

    Params:
    pattern = A single regular expression to match.
    patterns = An array of regular expression strings.
        The resulting `Regex` object will match any expression;
        use $(LREF whichPattern) to know which.
    flags = The _attributes (g, i, m, s and x accepted)

    Throws: $(D RegexException) if there were any errors during compilation.
+/
@trusted public auto regex(S)(S[] patterns, const(char)[] flags="")
if (isSomeString!(S))
{
    import std.array : appender;
    import std.functional : memoize;
    enum cacheSize = 8; //TODO: invent nice interface to control regex caching

    // An empty list has nothing to compile. Reject it explicitly rather than
    // reading patterns[0] out of bounds in this @trusted function.
    if (patterns.length == 0)
        throw new RegexException("regex: at least one pattern is required");

    S pat;
    if (patterns.length > 1)
    {
        // Join the alternatives as (?:p0\T0)\T0|(?:p1\T1)\T1|... where Ti is
        // the i-th code point counted from privateUseStart.
        auto app = appender!S();
        foreach (i, p; patterns)
        {
            if (i != 0)
                app.put("|");
            app.put("(?:");
            app.put(p);
            // terminator for the pattern
            // to detect if the pattern unexpectedly ends
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
            app.put(")");
            // another one to return correct whichPattern
            // for all of potential alternatives in the patterns[i]
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
        }
        pat = app.data;
    }
    else
        pat = patterns[0];

    // memoize is bypassed during CTFE; at run time up to cacheSize results are cached.
    if (__ctfe)
        return regexImpl(pat, flags);
    return memoize!(regexImpl!S, cacheSize)(pat, flags);
}

///ditto
@trusted public auto regex(S)(S pattern, const(char)[] flags="")
if (isSomeString!(S))
{
    return regex([pattern], flags);
}

///
@system unittest
{
    // multi-pattern regex example
    auto multi = regex([`([a-z]+):(\d+)`, `(\d+),\d+`]); // multi regex
    auto m = "abc:43 12,34".matchAll(multi);
    assert(m.front.whichPattern == 1);
    assert(m.front[1] == "abc");
    assert(m.front[2] == "43");
    m.popFront();
    assert(m.front.whichPattern == 2);
    assert(m.front[1] == "12");
}

// Uncached worker behind regex(): runs the parser over `pattern` with `flags`
// and returns the program it produced.
public auto regexImpl(S)(S pattern, const(char)[] flags="")
if (isSomeString!(S))
{
    import std.regex.internal.parser : Parser, CodeGen;
    auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags);
    auto r = parser.program;
    return r;
}


// Implementation of ctRegex: compiles the pattern at compile time, generates
// matcher source via ctGenRegExCode and mixes it into `func`.
template ctRegexImpl(alias pattern, string flags=[])
{
    import std.regex.internal.backtracking, std.regex.internal.parser;
    enum r = regex(pattern, flags);
    alias Char = BasicElementOf!(typeof(pattern));
    enum source = ctGenRegExCode(r);
    alias Matcher = BacktrackingMatcher!(true);
    @trusted bool func(ref Matcher!Char matcher)
    {
        debug(std_regex_ctr) pragma(msg, source);
        mixin(source);
    }
    enum nr = StaticRegex!Char(r, &func);
}

/++
    Compile regular expression using CTFE
    and generate optimized native machine code for matching it.

    Returns: StaticRegex object for faster matching.

    Params:
    pattern = Regular expression
    flags = The _attributes (g, i, m, s and x accepted)
+/
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr;

// True when RegEx is the Regex or StaticRegex instantiation for R's element type.
enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R))
     || is(RegEx == StaticRegex!(BasicElementOf!R));


/++
    $(D Captures) object contains submatches captured during a call
    to $(D match) or iteration over $(D RegexMatch) range.

    First element of range is the whole match.
+/
@trusted public struct Captures(R, DIndex = size_t)
if (isSomeString!R)
{//@trusted because of union inside
    alias DataIndex = DIndex;
    alias String = R;
private:
    import std.conv : text;
    R _input;
    int _nMatch;
    enum smallString = 3;
    enum SMALL_MASK = 0x8000_0000, REF_MASK= 0x1FFF_FFFF;
    // Up to smallString groups are stored inline; more go to a calloc'ed array.
    union
    {
        Group!DataIndex[] big_matches;
        Group!DataIndex[smallString] small_matches;
    }
    uint _f, _b;
    // NOTE(review): this count is a plain per-struct field, so a copy made by
    // the postblit does not share it with the original - confirm the buffer
    // cannot be freed while a copy still refers to it.
    uint _refcount; // ref count or SMALL MASK + num groups
    NamedGroup[] _names;

    this()(R input, uint n, NamedGroup[] named)
    {
        _input = input;
        _names = named;
        newMatches(n);
        _b = n;
        _f = 0;
    }

    this(alias Engine)(ref RegexMatch!(R,Engine) rmatch)
    {
        _input = rmatch._input;
        _names = rmatch._engine.re.dict;
        immutable n = rmatch._engine.re.ngroup;
        newMatches(n);
        _b = n;
        _f = 0;
    }

    // Active group storage: the inline array when SMALL_MASK is set
    // (length kept in the low byte of _refcount), otherwise the heap array.
    @property inout(Group!DataIndex[]) matches() inout
    {
        return (_refcount & SMALL_MASK)  ? small_matches[0 .. _refcount & 0xFF] : big_matches;
    }

    // Set up storage for n groups, allocating only when they do not fit inline.
    void newMatches(uint n)
    {
        import core.stdc.stdlib : calloc;
        import std.exception : enforce;
        if (n > smallString)
        {
            // calloc takes (element count, element size)
            auto p = cast(Group!DataIndex*) enforce(
                calloc(n, Group!DataIndex.sizeof),
                "Failed to allocate Captures struct"
            );
            big_matches = p[0 .. n];
            _refcount = 1;
        }
        else
        {
            _refcount = SMALL_MASK | n;
        }
    }

    // True for inline storage or when this struct's count is exactly one.
    bool unique()
    {
        return (_refcount & SMALL_MASK) || _refcount == 1;
    }

public:
    this(this)
    {
        if (!(_refcount & SMALL_MASK))
        {
            _refcount++;
        }
    }
    ~this()
    {
        import core.stdc.stdlib : free;
        if (!(_refcount & SMALL_MASK))
        {
            if (--_refcount == 0)
            {
                free(big_matches.ptr);
                big_matches = null;
            }
        }
    }
    ///Slice of input prior to the match.
    @property R pre()
    {
        return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
    }

    ///Slice of input immediately after the match.
    @property R post()
    {
        return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
    }

    ///Slice of matched portion of input.
    @property R hit()
    {
        assert(_nMatch, "attempted to get hit of an empty match");
        return _input[matches[0].begin .. matches[0].end];
    }

    ///Range interface.
    @property R front()
    {
        assert(_nMatch, "attempted to get front of an empty match");
        return _input[matches[_f].begin .. matches[_f].end];
    }

    ///ditto
    @property R back()
    {
        assert(_nMatch, "attempted to get back of an empty match");
        return _input[matches[_b - 1].begin .. matches[_b - 1].end];
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        ++_f;
    }

    ///ditto
    void popBack()
    {
        assert(!empty);
        --_b;
    }

    ///ditto
    @property bool empty() const { return _nMatch == 0 || _f >= _b; }

    ///ditto
    inout(R) opIndex()(size_t i) inout
    {
        assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
        assert(matches[_f + i].begin <= matches[_f + i].end,
            text("wrong match: ", matches[_f + i].begin, "..", matches[_f + i].end));
        return _input[matches[_f + i].begin .. matches[_f + i].end];
    }

    /++
        Explicit cast to bool.
        Useful as a shorthand for !(x.empty) in if and assert statements.

        ---
        import std.regex;

        assert(!matchFirst("nothing", "something"));
        ---
    +/

    @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }

    /++
        Number of the pattern that matched, counting from 1 (the first pattern).
        Returns 0 on no match.
    +/

    @safe @property int whichPattern() const nothrow { return _nMatch; }

    ///
    @system unittest
    {
        import std.regex;
        assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
    }

    /++
        Lookup named submatch.

        ---
        import std.regex;
        import std.range;

        auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
        assert(c["var"] == "a");
        assert(c["value"] == "42");
        popFrontN(c, 2);
        //named groups are unaffected by range primitives
        assert(c["var"] =="a");
        assert(c.front == "42");
        ---
    +/
    R opIndex(String)(String i) /*const*/ //@@@BUG@@@
        if (isSomeString!String)
    {
        size_t index = lookupNamedGroup(_names, i);
        return _input[matches[index].begin .. matches[index].end];
    }

    ///Number of matches in this object.
    @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f;  }

    ///A hook for compatibility with original std.regex.
    @property ref captures(){ return this; }
}

///
@system unittest
{
    import std.range.primitives : popFrontN;

    auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`));
    assert(c.pre == "@"); // Part of input preceding match
    assert(c.post == "#"); // Immediately after match
    assert(c.hit == c[0] && c.hit == "abc"); // The whole match
    assert(c[2] == "b");
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);

    assert(!matchFirst("nothing", "something"));
}

/++
    A regex engine state, as returned by $(D match) family of functions.

    Effectively it's a forward range of Captures!R, produced
    by lazily searching for matches in a given input.

    $(D alias Engine) specifies an engine type to use during matching,
    and is automatically deduced in a call to $(D match)/$(D bmatch).
+/
@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher)
if (isSomeString!R)
{
private:
    import core.stdc.stdlib : malloc, free;
    alias Char = BasicElementOf!R;
    alias EngineType = Engine!Char;
    EngineType _engine;
    R _input;
    Captures!(R,EngineType.DataIndex) _captures;
    // malloc'ed chunk: a size_t reference counter followed by the engine's memory
    void[] _memory;//is ref-counted

    this(RegEx)(R input, RegEx prog)
    {
        import std.exception : enforce;
        _input = input;
        immutable size = EngineType.initialMemory(prog)+size_t.sizeof;
        _memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
        scope(failure) free(_memory.ptr);
        *cast(size_t*)_memory.ptr = 1;
        _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]);
        static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
            _engine.nativeFn = prog.nativeFn;
        _captures = Captures!(R,EngineType.DataIndex)(this);
        _captures._nMatch = _engine.match(_captures.matches);
        debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter);
    }

    // The reference counter stored at the start of _memory.
    @property ref size_t counter(){ return *cast(size_t*)_memory.ptr; }
public:
    this(this)
    {
        if (_memory.ptr)
        {
            ++counter;
            debug(std_regex_allocation) writefln("RefCount (postblit): %x %d",
                _memory.ptr, *cast(size_t*)_memory.ptr);
        }
    }

    ~this()
    {
        if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0)
        {
            debug(std_regex_allocation) writefln("RefCount (dtor): %x %d",
                _memory.ptr, *cast(size_t*)_memory.ptr);
            free(cast(void*)_memory.ptr);
        }
    }

    ///Shorthands for front.pre, front.post, front.hit.
    @property R pre()
    {
        return _captures.pre;
    }

    ///ditto
    @property R post()
    {
        return _captures.post;
    }

    ///ditto
    @property R hit()
    {
        return _captures.hit;
    }

    /++
        Functionality for processing subsequent matches of global regexes via range interface:
        ---
        import std.regex;
        auto m = matchAll("Hello, world!", regex(`\w+`));
        assert(m.front.hit == "Hello");
        m.popFront();
        assert(m.front.hit == "world");
        m.popFront();
        assert(m.empty);
        ---
    +/
    @property auto front()
    {
        return _captures;
    }

    ///ditto
    void popFront()
    {
        import std.exception : enforce;
        if (counter != 1)
        {//do cow magic first
            immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof;
            // Allocate before touching the shared counter: if malloc fails,
            // the exception leaves this object's reference still counted.
            auto fresh = enforce(malloc(size), "malloc failed")[0 .. size];
            counter--;//we abandon this reference
            _memory = fresh;
            _engine = _engine.dupTo(_memory[size_t.sizeof .. size]);
            counter = 1;//points to new chunk
        }

        if (!_captures.unique)
        {
            // has external references - allocate new space
            _captures.newMatches(_engine.re.ngroup);
        }
        _captures._nMatch = _engine.match(_captures.matches);
    }

    ///ditto
    auto save(){ return this; }

    ///Test if this match object is empty.
    @property bool empty() const { return _captures._nMatch == 0; }

    ///Same as !(x.empty), provided for its convenience  in conditional statements.
    T opCast(T:bool)(){ return !empty; }

    /// Same as .front, provided for compatibility with original std.regex.
    @property auto captures() inout { return _captures; }

}

// Run a single match of `re` against `input` using a temporary engine whose
// working memory is freed on exit; only the Captures value is returned.
private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re)
{
    import core.stdc.stdlib : malloc, free;
    import std.exception : enforce;
    alias Char = BasicElementOf!R;
    alias EngineType = Engine!Char;

    size_t size = EngineType.initialMemory(re);
    void[] memory = enforce(malloc(size), "malloc failed")[0 .. size];
    scope(exit) free(memory.ptr);
    auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict);
    auto engine = EngineType(re, Input!Char(input), memory);
    static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
        engine.nativeFn = re.nativeFn;
    captures._nMatch = engine.match(captures.matches);
    return captures;
}

// Build a RegexMatch over `input`, setting the global flag on the by-value
// copy of `re` first.
private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re)
{
    re.flags |= RegexOption.global;
    return RegexMatch!(R, Engine)(input, re);
}

@system unittest
{
    //sanity checks for new API
    auto re = regex("abc");
    assert(!"abc".matchOnce!(ThompsonMatcher)(re).empty);
    assert("abc".matchOnce!(ThompsonMatcher)(re)[0] == "abc");
}


// True when `fun` can be called with a single Captures!R argument.
private enum isReplaceFunctor(alias fun, R) =
    __traits(compiles, (Captures!R c) { fun(c); });

// the lowest level - just stuff replacements into the sink
private @trusted void replaceCapturesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T captures)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    if (captures.empty)
    {
        sink.put(input);
        return;
    }
    sink.put(captures.pre);
    // a hack to get around bogus errors, should be simply output(captures, sink)
    // "is a nested function and cannot be accessed from"
    static if (isReplaceFunctor!(output, R))
        sink.put(output(captures)); //"mutator" type of function
    else
        output(captures, sink); //"output" type of function
    sink.put(captures.post);
}

// ditto for a range of captures
private void replaceMatchesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T matches)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    // offset = end of the previous match in `input`, so only the text
    // between consecutive matches is copied through.
    size_t offset = 0;
    foreach (cap; matches)
    {
        sink.put(cap.pre[offset .. $]);
        // same hack, see replaceCapturesInto
        static if (isReplaceFunctor!(output, R))
            sink.put(output(cap)); //"mutator" type of function
        else
            output(cap, sink); //"output" type of function
        offset = cap.pre.length + cap.hit.length;
    }
    sink.put(input[offset .. $]);
}

//  a general skeleton of replaceFirst
private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto data = matchFirst(input, re);
    if (data.empty)
        return input;
    auto app = appender!(R)();
    replaceCapturesInto!output(app, input, data);
    return app.data;
}

// ditto for replaceAll
// the method parameter allows old API to ride on the back of the new one
private R replaceAllWith(alias output,
        alias method=matchAll, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto matches = method(input, re); //inout(C)[] fails
    if (matches.empty)
        return input;
    auto app = appender!(R)();
    replaceMatchesInto!output(app, input, matches);
    return app.data;
}


/++
    Start matching $(D input) to regex pattern $(D re),
    using Thompson NFA matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on case by case basis.

    Returns: a $(D RegexMatch) object holding engine state after first match.
+/

public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // Matcher over the unqualified input type, driven by the Thompson engine.
    alias Result = RegexMatch!(Unqual!R, ThompsonMatcher);
    return Result(input, re);
}

///ditto
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // The pattern arrives as text, so compile it with regex() first.
    alias Result = RegexMatch!(Unqual!R, ThompsonMatcher);
    return Result(input, regex(re));
}

///ditto
public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // A StaticRegex is matched with the backtracking engine.
    alias Result = RegexMatch!(Unqual!R, BacktrackingMatcher!true);
    return Result(input, re);
}

/++
    Find the first (leftmost) slice of the $(D input) that
    matches the pattern $(D re). This function picks the most suitable
    regular expression engine depending on the pattern properties.

    $(D re) parameter can be one of three types:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF Captures) containing the extent of a match together with all submatches
    if there was a match, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    alias Engine = ThompsonMatcher;
    return matchOnce!Engine(input, re);
}

///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // compile the textual pattern, then search once
    auto compiled = regex(re);
    return matchOnce!ThompsonMatcher(input, compiled);
}

///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // several textual patterns are compiled into one multi-pattern regex
    auto compiled = regex(re);
    return matchOnce!ThompsonMatcher(input, compiled);
}

public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // compile-time pattern: uses the BacktrackingMatcher!true engine
    alias Engine = BacktrackingMatcher!true;
    return matchOnce!Engine(input, re);
}

/++
    Initiate a search for all non-overlapping matches to the pattern $(D re)
    in the given $(D input). The result is a lazy range of matches generated
    as they are encountered in the input going left to right.

    This function picks the most suitable regular expression engine
    depending on the pattern properties.

    $(D re) parameter can be one of three types:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled  bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF RegexMatch) object that represents matcher state
    after the first match was found or an empty one if not present.
+/
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.thompson : ThompsonMatcher;
    alias Engine = ThompsonMatcher;
    return matchMany!Engine(input, re);
}

///ditto
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // compile the textual pattern, then iterate over its matches
    auto compiled = regex(re);
    return matchMany!ThompsonMatcher(input, compiled);
}

///ditto
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.thompson : ThompsonMatcher;
    // several textual patterns are compiled into one multi-pattern regex
    auto compiled = regex(re);
    return matchMany!ThompsonMatcher(input, compiled);
}

public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // compile-time pattern: uses the BacktrackingMatcher!true engine
    alias Engine = BacktrackingMatcher!true;
    return matchMany!Engine(input, re);
}

// another set of tests just to cover the new API
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : to;

    foreach (String; AliasSeq!(string, wstring, const(dchar)[]))
    {
        // plain string pattern
        auto str1 = "blah-bleh".to!String();
        auto pat1 = "bl[ae]h".to!String();
        auto mf = matchFirst(str1, pat1);
        assert(mf.equal(["blah".to!String()]));
        auto mAll = matchAll(str1, pat1);
        assert(mAll.equal!((a,b) => a.equal(b))
            ([["blah".to!String()], ["bleh".to!String()]]));

        // run-time compiled multi-pattern regex
        auto str2 = "1/03/12 - 3/03/12".to!String();
        auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]);
        auto mf2 = matchFirst(str2, pat2);
        assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)()));
        auto mAll2 = matchAll(str2, pat2);
        assert(mAll2.front.equal(mf2));
        mAll2.popFront();
        assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)()));
        mf2.popFrontN(3);
        assert(mf2.equal(["12".to!String()]));

        // compile-time regex with named captures
        auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String());
        auto str = "2 + 34/56 - 6/1".to!String();
        auto cmf = matchFirst(str, ctPat);
        assert(cmf.equal(["34/56", "34", "56"].map!(to!String)()));
        assert(cmf["Quot"] == "34".to!String());
        assert(cmf["Denom"] == "56".to!String());

        auto cmAll = matchAll(str, ctPat);
        assert(cmAll.front.equal(cmf));
        cmAll.popFront();
        assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)()));
    }
}

/++
    Start matching of $(D input) to regex pattern $(D re),
    using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking,
    backtracking) matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on case by case basis.

    Returns: a $(D RegexMatch) object holding engine
    state after first match.
+/
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re);
}

///ditto
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    // the string pattern is compiled with regex() before matching
    return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re));
}

// StaticRegex overload: uses BacktrackingMatcher!true instead of BacktrackingMatcher!false
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
    import std.regex.internal.backtracking : BacktrackingMatcher;
    return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re);
}

// produces replacement string from format using captures for substitution
//
// Params:
//   format        = replacement format; literal text is copied to `sink`,
//                   '$' introduces a substitution (see the Dollar state below)
//   captures      = captures of the match being replaced
//   sink          = output range receiving the generated replacement
//   ignoreBadSubs = when true, a `$n` whose number is not below
//                   captures.length is skipped instead of throwing
//
// Throws (via enforce): on an out-of-range `$n` (unless ignoreBadSubs),
// on a malformed `${...}`, and when the format ends right after a '$'.
package void replaceFmt(R, Capt, OutR)
    (R format, Capt captures, OutR sink, bool ignoreBadSubs = false)
if (isOutputRange!(OutR, ElementEncodingType!R[]) &&
    isOutputRange!(OutR, ElementEncodingType!(Capt.String)[]))
{
    import std.algorithm.searching : find;
    import std.ascii : isDigit, isAlpha;
    import std.conv : text, parse;
    import std.exception : enforce;
    // Normal: copying literal text; Dollar: a '$' was just consumed
    enum State { Normal, Dollar }
    auto state = State.Normal;
    size_t offset;
L_Replace_Loop:
    while (!format.empty)
        final switch (state)
        {
            case State.Normal:
                // scan code units for the next '$'
                for (offset = 0; offset < format.length; offset++)//no decoding
                {
                    if (format[offset] == '$')
                    {
                        state = State.Dollar;
                        // flush the literal text preceding the '$' and drop the '$' itself
                        sink.put(format[0 .. offset]);
                        format = format[offset+1 .. $];//ditto
                        continue L_Replace_Loop;
                    }
                }
                // no '$' left: the remainder is literal text
                sink.put(format[0 .. offset]);
                format = format[offset .. $];
                break;
            case State.Dollar:
                if (isDigit(format[0]))
                {
                    // $n - numbered submatch; parse advances `format` past what it consumed
                    uint digit = parse!uint(format);
                    enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit));
                    if (digit < captures.length)
                        sink.put(captures[digit]);
                }
                else if (format[0] == '{')
                {
                    // ${name} - named submatch; the name is the run of
                    // std.ascii.isAlpha characters up to the closing '}'
                    auto x = find!(a => !isAlpha(a))(format[1..$]);
                    enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format");
                    auto name = format[1 .. $ - x.length];
                    format = x[1..$];
                    enforce(!name.empty, "invalid name in ${...} replacement format");
                    sink.put(captures[name]);
                }
                else if (format[0] == '&')
                {
                    // $& - the whole match
                    sink.put(captures[0]);
                    format = format[1 .. $];
                }
                else if (format[0] == '`')
                {
                    // $` - the part of input preceding the match
                    sink.put(captures.pre);
                    format = format[1 .. $];
                }
                else if (format[0] == '\'')
                {
                    // $' - the part of input following the match
                    sink.put(captures.post);
                    format = format[1 .. $];
                }
                else if (format[0] == '$')
                {
                    // $$ - a literal '$'
                    sink.put(format[0 .. 1]);
                    format = format[1 .. $];
                }
                // any other character: nothing is emitted for the '$' and the
                // character itself is left in `format` to be copied as literal text
                state = State.Normal;
                break;
        }
    // still in Dollar state here means the format ended with a lone '$'
    enforce(state == State.Normal, "invalid format string in regex replace");
}

/++
    Construct a new string from $(D input) by replacing the first match with
    a string generated from it according to the $(D format) specifier.

    To replace all matches use $(LREF replaceAll).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type with the first match (if any) replaced.
    If no match is found returns the input string itself.
+/
public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // the replacement text is produced by expanding `format` against the captures
    return replaceFirstWith!((caps, dest) => replaceFmt(format, caps, dest))(input, re);
}

///
@system unittest
{
    assert(replaceFirst("noon", regex("n"), "[$&]") == "[n]oon");
}

/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern $(D re) in the $(D input). Unlike the other overload
    there is no format string, instead captures are passed
    to a user-defined functor $(D fun) that returns a new string
    to use as replacement.

    This version replaces the first match in $(D input),
    see $(LREF replaceAll) to replace all of the matches.

    Returns:
    A new string of the same type as $(D input) with the first match
    replaced by the return value of $(D fun). If no match is found
    returns the $(D input) itself.
+/
public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // the replacement text is whatever `fun` returns for the captures
    return replaceFirstWith!((caps, dest) => dest.put(fun(caps)))(input, re);
}

///
@system unittest
{
    import std.conv : to;
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
}

/++
    A variation on $(LREF replaceFirst) that instead of allocating a new string
    on each call outputs the result piece-wise to the $(D sink). In particular
    this enables efficient construction of a final output incrementally.

    Like in $(LREF replaceFirst) family of functions there is an overload
    for the substitution guided by the $(D format) string
    and the one with the user defined callback.
+/
public @trusted void replaceFirstInto(Sink, R, C, RegEx)
        (auto ref Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // `auto ref`: lvalue sinks are still updated in place (as with the former
    // `ref`), while rvalue sinks are now accepted too, like in the functor
    // overload below and in replaceAllInto.
    replaceCapturesInto!((m, sink) => replaceFmt(format, m, sink))
        (sink, input, matchFirst(input, re));
}

///ditto
public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx)
    (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    replaceCapturesInto!fun(sink, input, matchFirst(input, re));
}

///
@system unittest
{
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}

//examples for replaceFirst
@system unittest
{
    import std.conv;
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
    //the format overload accepts an rvalue sink as well
    static assert(__traits(compiles,
        replaceFirstInto(appender!string(), m1, regex(`([a-z]+) message`), "$1")));
}

/++
    Construct a new string from $(D input) by replacing all of the
    fragments that match a pattern $(D re) with a string generated
    from the match according to the $(D format) specifier.

    To replace only the first match use $(LREF replaceFirst).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type as $(D input) with all
    of the matches (if any) replaced.
    If no match is found returns the input string itself.
+/
public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re);
}

///
@system unittest
{
    // insert comma as thousands delimiter
    auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g");
    assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100");
}

/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern $(D re) in the $(D input). Unlike the other overload
    there is no format string, instead captures are passed
    to a user-defined functor $(D fun) that returns a new string
    to use as replacement.

    This version replaces all of the matches found in $(D input),
    see $(LREF replaceFirst) to replace the first match only.

    Returns:
    A new string of the same type as $(D input) with all matches
    replaced by return values of $(D fun). If no matches found
    returns the $(D input) itself.
Params:
    input = string to search
    re = compiled regular expression
    fun = delegate to use
+/
public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // each match is replaced by whatever `fun` returns for its captures
    return replaceAllWith!((caps, dest) => dest.put(fun(caps)))(input, re);
}

///
@system unittest
{
    string baz(Captures!(string) m)
    {
        import std.string : toUpper;
        return toUpper(m.hit);
    }
    // Capitalize the letters 'a' and 'r':
    auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.",
            regex("[ar]"));
    assert(s == "StRAp A Rocket engine on A chicken.");
}

/++
    A variation on $(LREF replaceAll) that instead of allocating a new string
    on each call outputs the result piece-wise to the $(D sink). In particular
    this enables efficient construction of a final output incrementally.

    As with $(LREF replaceAll) there are 2 overloads - one with a format string,
    the other one with a user defined functor.
+/
public @trusted void replaceAllInto(Sink, R, C, RegEx)
        (Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!((m, sink) => replaceFmt(format, m, sink))
        (sink, input, matchAll(input, re));
}

///ditto
public @trusted void replaceAllInto(alias fun, Sink, R, RegEx)
        (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!fun(sink, input, matchAll(input, re));
}

///
@system unittest
{
    // insert comma as thousands delimiter in fifty randomly produced big numbers
    import std.array, std.conv, std.random, std.range;
    static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g");
    auto sink = appender!(char [])();
    enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19;
    foreach (i; 0 .. 50)
    {
        sink.clear();
        replaceAllInto(sink, text(uniform(min, max)), re, ",");
        foreach (pos; iota(sink.data.length - 4, 0, -4))
            assert(sink.data[pos] == ',');
    }
}

// exercise all of the replace APIs
@system unittest
{
    import std.array : appender;
    import std.conv;
    // try and check first/all simple substitution
    foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[]))
    {
        S s1 = "curt trial".to!S();
        S s2 = "round dome".to!S();
        S t1F = "court trial".to!S();
        S t2F = "hound dome".to!S();
        S t1A = "court trial".to!S();
        S t2A = "hound home".to!S();
        auto re1 = regex("curt".to!S());
        auto re2 = regex("[dr]o".to!S());

        assert(replaceFirst(s1, re1, "court") == t1F);
        assert(replaceFirst(s2, re2, "ho") == t2F);
        assert(replaceAll(s1, re1, "court") == t1A);
        assert(replaceAll(s2, re2, "ho") == t2A);

        auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1 == t1F);
        assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F);
        auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1A == t1A);
        assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A);

        auto sink = appender!S();
        replaceFirstInto(sink, s1, re1, "court");
        assert(sink.data == t1F);
        replaceFirstInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F);
        replaceAllInto(sink, s1, re1, "court");
        assert(sink.data == t1F~t2F~t1A);
        replaceAllInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F~t1A~t2A);
    }
}

/++
    Old API for replacement, operation depends on flags of pattern $(D re).
    With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it
    works the same as $(LREF replaceFirst).

    The use of this function is $(RED discouraged), please use $(LREF replaceAll)
    or $(LREF replaceFirst) explicitly.

    Params:
    scheme = function used to produce the matches, $(LREF match) by default
    (format overload only)
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from
    fun = functor that returns the replacement for given captures
+/
public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // honor the caller's choice of matching function; the previous code
    // always passed `match` here, silently ignoring `scheme`
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), scheme)(input, re);
}

///ditto
public R replace(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return replaceAllWith!(fun, match)(input, re);
}

// the scheme parameter of the old API selects the matching function
@system unittest
{
    assert(replace!(bmatch)("noon", regex("n", "g"), "N") == "NooN");
    assert(replace!(match)("noon", regex("n", "g"), "N") == "NooN");
}

/**
Splits a string `r` using a regular expression `pat` as a separator.
Params:
    keepSeparators = flag to specify if the matches should be in the resulting range
    r = the string to split
    pat = the pattern to split on
Returns:
    A lazy range of strings
*/
public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex)
if (isSomeString!Range && isRegexFor!(RegEx, Range))
{
private:
    Range _input;   // the string being split
    size_t _offset; // index in _input where the current front starts
    alias Rx = typeof(match(Range.init,RegEx.init));
    Rx _match;      // matcher positioned at the next separator

    // true while front is a separator rather than the text between separators
    static if (keepSeparators) bool onMatch = false;

    @trusted this(Range input, RegEx separator)
    {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
        _input = input;
        // set the global flag on the local copy of the pattern
        separator.flags |= RegexOption.global;
        if (_input.empty)
        {
            //there is nothing to match at all, make _offset > 0
            _offset = 1;
        }
        else
        {
            _match = Rx(_input, separator);

            // nothing precedes the first match: step over the empty leading piece
            static if (keepSeparators)
                if (_match.pre.empty)
                    popFront();
        }
    }

public:
    // slicing without arguments yields a copy of the range
    auto ref opSlice()
    {
        return this.save;
    }

    ///Forward range primitives.
    @property Range front()
    {
        import std.algorithm.comparison : min;

        assert(!empty && _offset <= _match.pre.length
                && _match.pre.length <= _input.length);

        static if (keepSeparators)
        {
            if (!onMatch)
                // text from _offset up to the start of the next separator
                return _input[_offset .. min($, _match.pre.length)];
            else
                // the separator itself
                return _match.hit();
        }
        else
        {
            return _input[_offset .. min($, _match.pre.length)];
        }
    }

    ///ditto
    @property bool empty()
    {
        // _offset == _input.length ends the range only in keepSeparators mode;
        // otherwise the range ends once _offset exceeds _input.length
        static if (keepSeparators)
            return _offset >= _input.length;
        else
            return _offset > _input.length;
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        if (_match.empty)
        {
            //No more separators, work is done here
            _offset = _input.length + 1;
        }
        else
        {
            static if (keepSeparators)
            {
                if (!onMatch)
                {
                    //move to the start of the separator, it becomes the next front
                    _offset = _match.pre.length;
                }
                else
                {
                    //step over the separator and advance to the next match
                    _offset += _match.hit.length;
                    _match.popFront();
                }

                // alternate between text pieces and separators
                onMatch = !onMatch;
            }
            else
            {
                //skip past the separator
                _offset = _match.pre.length + _match.hit.length;
                _match.popFront();
            }
        }
    }

    ///ditto
    @property auto save()
    {
        return this;
    }
}

/// ditto
public Splitter!(keepSeparators, Range, RegEx) splitter(
    Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat)
if (
    is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range))
{
    return Splitter!(keepSeparators, Range, RegEx)(r, pat);
}

///
@system unittest
{
    import std.algorithm.comparison : equal;
    auto s1 = ", abc, de,  fg, hi, ";
    assert(equal(splitter(s1, regex(", *")),
        ["", "abc", "de", "fg", "hi", ""]));
}

/// Split on a pattern, but keep the matches in the resulting range
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : Yes;

    auto pattern = regex(`([\.,])`);

    assert("2003.04.05"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal(["2003", ".", "04", ".", "05"]));

    assert(",1,2,3"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal([",", "1", ",", "2", ",", "3"]));
}

///An eager version of $(D splitter) that creates an
///array with split slices of $(D input).
public @trusted String[] split(String, RegEx)(String input, RegEx rx)
if (isSomeString!String && isRegexFor!(RegEx, String))
{
    import std.array : appender;
    // drain the lazy splitter into an array
    auto a = appender!(String[])();
    foreach (e; splitter(input, rx))
        a.put(e);
    return a.data;
}

///Exception object thrown in case of errors during regex compilation.
public alias RegexException = std.regex.internal.ir.RegexException;

/++
  A range that lazily produces a string output escaped
  to be used inside of a regular expression.

  Params:
  r = range of characters to escape

  Returns:
  A range yielding the elements of $(D r) in which every character
  listed in `Escapables` is preceded by a backslash.
+/
auto escaper(Range)(Range r)
{
    import std.algorithm.searching : find;
    static immutable escapables = [Escapables];
    static struct Escaper // template to deduce attributes
    {
        Range r;
        // true when the backslash for r.front is still to be emitted
        bool escaped;

        @property ElementType!Range front(){
          if (escaped)
              return '\\';
          else
              return r.front;
        }

        @property bool empty(){ return r.empty; }

        void popFront(){
          // the backslash was consumed, the escaped character itself is next
          if (escaped) escaped = false;
          else
          {
              r.popFront();
              // schedule a backslash if the new front needs escaping
              if (!r.empty && !escapables.find(r.front).empty)
                  escaped = true;
          }
        }

        @property auto save(){ return Escaper(r.save, escaped); }
    }

    // the very first character may need escaping too
    bool escaped = !r.empty && !escapables.find(r.front).empty;
    return Escaper(r, escaped);
}

///
@system unittest
{
    import std.algorithm.comparison;
    import std.regex;
    string s = `This is {unfriendly} to *regex*`;
    assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`));
}

@system unittest
{
    import std.algorithm.comparison;
    import std.conv;
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto s = "^".to!S;
        assert(s.escaper.equal(`\^`));
        // empty input of every string width (previously only `string` was covered)
        auto s2 = "".to!S;
        assert(s2.escaper.equal(""));
    }
}