1 /++ 2 $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions) 3 are a commonly used method of pattern matching 4 on strings, with $(I regex) being a catchy word for a pattern in this domain 5 specific language. Typical problems usually solved by regular expressions 6 include validation of user input and the ubiquitous find $(AMP) replace 7 in text processing utilities. 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Matching) $(TD 14 $(LREF bmatch) 15 $(LREF match) 16 $(LREF matchAll) 17 $(LREF matchFirst) 18 )) 19 $(TR $(TD Building) $(TD 20 $(LREF ctRegex) 21 $(LREF escaper) 22 $(LREF regex) 23 )) 24 $(TR $(TD Replace) $(TD 25 $(LREF replace) 26 $(LREF replaceAll) 27 $(LREF replaceAllInto) 28 $(LREF replaceFirst) 29 $(LREF replaceFirstInto) 30 )) 31 $(TR $(TD Split) $(TD 32 $(LREF split) 33 $(LREF splitter) 34 )) 35 $(TR $(TD Objects) $(TD 36 $(LREF Captures) 37 $(LREF Regex) 38 $(LREF RegexException) 39 $(LREF RegexMatch) 40 $(LREF Splitter) 41 $(LREF StaticRegex) 42 )) 43 )) 44 45 $(SECTION Synopsis) 46 --- 47 import std.regex; 48 import std.stdio; 49 void main() 50 { 51 // Print out all possible dd/mm/yy(yy) dates found in user input. 52 auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b"); 53 foreach (line; stdin.byLine) 54 { 55 // matchAll() returns a range that can be iterated 56 // to get all subsequent matches. 57 foreach (c; matchAll(line, r)) 58 writeln(c.hit); 59 } 60 } 61 ... 62 63 // Create a static regex at compile-time, which contains fast native code. 64 auto ctr = ctRegex!(`^.*/([^/]+)/?$`); 65 66 // It works just like a normal regex: 67 auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any 68 assert(!c2.empty); // Be sure to check if there is a match before examining contents! 69 assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match. 70 71 ... 
    // multi-pattern regex
    auto multi = regex([`(\d+),\d+`, `([a-z]+):(\d+)`]);
    auto m = "abc:43 12,34".matchAll(multi);
    assert(m.front.whichPattern == 2);
    assert(m.front[1] == "abc");
    assert(m.front[2] == "43");
    m.popFront();
    assert(m.front.whichPattern == 1);
    assert(m.front[1] == "12");
    ...

    // The result of `matchAll/matchFirst` is directly testable with `if/assert/while`,
    // e.g. test if a string consists of letters only:
    assert(matchFirst("LettersOnly", `^\p{L}+$`));

    // And we can take advantage of the ability to define a variable in the $(LINK2 https://dlang.org/spec/statement.html#IfCondition `IfCondition`):
    if (const auto captures = matchFirst("At l34st one digit, but maybe more...", `((\d)(\d*))`))
    {
        assert(captures[2] == "3");
        assert(captures[3] == "4");
        assert(captures[1] == "34");
    }
    ---

    $(SECTION Syntax and general information)
    The general usage guideline is to keep regex complexity on the side of simplicity,
    as its capabilities reside in purely character-level manipulation.
    As such it's ill-suited for tasks involving higher level invariants
    like matching an integer number $(U bounded) in an [a,b] interval.
    Checks of this sort are better addressed by additional post-processing.

    The basic syntax shouldn't surprise experienced users of regular expressions.
    For an introduction to `std.regex` see a
    $(HTTP dlang.org/regular-expression.html, short tour) of the module API
    and its abilities.

    There are other web resources on regular expressions to help newcomers,
    and a good $(HTTP www.regular-expressions.info, reference with tutorial)
    can easily be found.

    This library uses a remarkably common ECMAScript syntax flavor
    with the following extensions:
    $(UL
      $(LI Named subexpressions, with Python syntax.
      )
      $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.)
      $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.)
    )

    $(REG_START Pattern syntax )
    $(I std.regex operates on codepoint level,
      'character' in this table denotes a single Unicode codepoint.)
    $(REG_TABLE
      $(REG_TITLE Pattern element, Semantics )
      $(REG_TITLE Atoms, Match single characters )
      $(REG_ROW any character except [{|*+?()^$, Matches the character itself. )
      $(REG_ROW ., In single line mode matches any character.
        Otherwise it matches any character except '\n' and '\r'. )
      $(REG_ROW [class], Matches a single character
        that belongs to this character class. )
      $(REG_ROW [^class], Matches a single character that
        does $(U not) belong to this character class.)
      $(REG_ROW \cC, Matches the control character corresponding to letter C)
      $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. )
      $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. )
      $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. )
      $(REG_ROW \f, Matches a formfeed character. )
      $(REG_ROW \n, Matches a linefeed character. )
      $(REG_ROW \r, Matches a carriage return character. )
      $(REG_ROW \t, Matches a tab character. )
      $(REG_ROW \v, Matches a vertical tab character. )
      $(REG_ROW \d, Matches any Unicode digit. )
      $(REG_ROW \D, Matches any character except Unicode digits. )
      $(REG_ROW \w, Matches any word character (note: this includes numbers).)
      $(REG_ROW \W, Matches any non-word character.)
      $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.)
      $(REG_ROW \S, Matches any character except those recognized as $(I \s ). )
      $(REG_ROW \\\\, Matches \ character. )
      $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself.
) 150 $(REG_ROW \p{PropertyName}, Matches a character that belongs 151 to the Unicode PropertyName set. 152 Single letter abbreviations can be used without surrounding {,}. ) 153 $(REG_ROW \P{PropertyName}, Matches a character that does not belong 154 to the Unicode PropertyName set. 155 Single letter abbreviations can be used without surrounding {,}. ) 156 $(REG_ROW \p{InBasicLatin}, Matches any character that is part of 157 the BasicLatin Unicode $(U block).) 158 $(REG_ROW \P{InBasicLatin}, Matches any character except ones in 159 the BasicLatin Unicode $(U block).) 160 $(REG_ROW \p{Cyrillic}, Matches any character that is part of 161 Cyrillic $(U script).) 162 $(REG_ROW \P{Cyrillic}, Matches any character except ones in 163 Cyrillic $(U script).) 164 $(REG_TITLE Quantifiers, Specify repetition of other elements) 165 $(REG_ROW *, Matches previous character/subexpression 0 or more times. 166 Greedy version - tries as many times as possible.) 167 $(REG_ROW *?, Matches previous character/subexpression 0 or more times. 168 Lazy version - stops as early as possible.) 169 $(REG_ROW +, Matches previous character/subexpression 1 or more times. 170 Greedy version - tries as many times as possible.) 171 $(REG_ROW +?, Matches previous character/subexpression 1 or more times. 172 Lazy version - stops as early as possible.) 173 $(REG_ROW ?, Matches previous character/subexpression 0 or 1 time. 174 Greedy version - tries as many times as possible.) 175 $(REG_ROW ??, Matches previous character/subexpression 0 or 1 time. 176 Lazy version - stops as early as possible.) 177 $(REG_ROW {n}, Matches previous character/subexpression exactly n times. ) 178 $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more. 179 Greedy version - tries as many times as possible. ) 180 $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more. 181 Lazy version - stops as early as possible.) 
      $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times.
        Greedy version - tries as many times as possible, but no more than m times. )
      $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times.
        Lazy version - stops as early as possible, but no less than n times.)
      $(REG_TITLE Other, Subexpressions $(AMP) alternations )
      $(REG_ROW (regex), Matches subexpression regex,
        saving matched portion of text for later retrieval. )
      $(REG_ROW (?#comment), An inline comment that is ignored while matching.)
      $(REG_ROW (?:regex), Matches subexpression regex,
        $(U not) saving matched portion of text. Useful to speed up matching. )
      $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. )
      $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression
        regex labeling it with name 'name'.
        When referring to a matched portion of text,
        names work like aliases in addition to direct numbers.
      )
      $(REG_TITLE Assertions, Match position rather than character )
      $(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).)
      $(REG_ROW $, Matches at the end of input or line (in multiline mode). )
      $(REG_ROW \b, Matches at word boundary. )
      $(REG_ROW \B, Matches when $(U not) at word boundary. )
      $(REG_ROW (?=regex), Zero-width lookahead assertion.
        Matches at a point where the subexpression
        regex could be matched starting from the current position.
      )
      $(REG_ROW (?!regex), Zero-width negative lookahead assertion.
        Matches at a point where the subexpression
        regex could $(U not) be matched starting from the current position.
      )
      $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point
        where the subexpression regex could be matched ending
        at the current position (matching goes backwards).
      )
      $(REG_ROW (?<!regex), Zero-width negative lookbehind assertion.
216 Matches at a point where the subexpression regex could $(U not) 217 be matched ending at the current position (matching goes backwards). 218 ) 219 ) 220 221 $(REG_START Character classes ) 222 $(REG_TABLE 223 $(REG_TITLE Pattern element, Semantics ) 224 $(REG_ROW Any atom, Has the same meaning as outside of a character class, 225 except for ] which must be written as \\]) 226 $(REG_ROW a-z, Includes characters a, b, c, ..., z. ) 227 $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b], 228 Where a, b are arbitrary classes, means union, set difference, 229 symmetric set difference, and intersection respectively. 230 $(I Any sequence of character class elements implicitly forms a union.) ) 231 ) 232 233 $(REG_START Regex flags ) 234 $(REG_TABLE 235 $(REG_TITLE Flag, Semantics ) 236 $(REG_ROW g, Global regex, repeat over the whole input. ) 237 $(REG_ROW i, Case insensitive matching. ) 238 $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators 239 as well as start and end of input.) 240 $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. ) 241 $(REG_ROW x, Free-form syntax, ignores whitespace in pattern, 242 useful for formatting complex regular expressions. ) 243 ) 244 245 $(SECTION Unicode support) 246 247 This library provides full Level 1 support* according to 248 $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically: 249 $(UL 250 $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.) 251 $(LI 1.2 Unicode properties.) 252 $(LI 1.3 Character classes with set operations.) 253 $(LI 1.4 Word boundaries use the full set of "word" characters.) 254 $(LI 1.5 Using simple casefolding to match case 255 insensitively across the full range of codepoints.) 256 $(LI 1.6 Respecting line breaks as any of 257 \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.) 258 $(LI 1.7 Operating on codepoint level.) 
259 ) 260 *With exception of point 1.1.1, as of yet, normalization of input 261 is expected to be enforced by user. 262 263 $(SECTION Replace format string) 264 265 A set of functions in this module that do the substitution rely 266 on a simple format to guide the process. In particular the table below 267 applies to the `format` argument of 268 $(LREF replaceFirst) and $(LREF replaceAll). 269 270 The format string can reference parts of match using the following notation. 271 $(REG_TABLE 272 $(REG_TITLE Format specifier, Replaced by ) 273 $(REG_ROW $(DOLLAR)$(AMP), the whole match. ) 274 $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. ) 275 $(REG_ROW $', part of input $(I following) the match. ) 276 $(REG_ROW $$, '$' character. ) 277 $(REG_ROW \c $(COMMA) where c is any character, the character c itself. ) 278 $(REG_ROW \\\\, '\\' character. ) 279 $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. ) 280 ) 281 282 $(SECTION Slicing and zero memory allocations orientation) 283 284 All matches returned by pattern matching functionality in this library 285 are slices of the original input. The notable exception is the `replace` 286 family of functions that generate a new string from the input. 287 288 In cases where producing the replacement is the ultimate goal 289 $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy 290 as functions that avoid allocations even for replacement. 291 292 Copyright: Copyright Dmitry Olshansky, 2011- 293 294 License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). 295 296 Authors: Dmitry Olshansky, 297 298 API and utility constructs are modeled after the original `std.regex` 299 by Walter Bright and Andrei Alexandrescu. 

    Source: $(PHOBOSSRC std/regex/package.d)

Macros:
    REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) )
    REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) )
    REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table>
    REG_START = <h3><div align="center"> $0 </div></h3>
    SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3>
    S_LINK = <a href="#$1">$+</a>
 +/
module std.regex;

import std.range.primitives, std.traits;
import std.regex.internal.ir;
import std.typecons : Flag, Yes, No;

/++
    A `Regex` object holds a regular expression pattern in compiled form.

    Instances of this object are constructed via calls to `regex`.
    This is the intended form for caching and storage of frequently
    used regular expressions.

    Example:

    Test if this object doesn't contain any compiled pattern.
    ---
    Regex!char r;
    assert(r.empty);
    r = regex(""); // Note: "" is a valid regex pattern.
    assert(!r.empty);
    ---

    Getting a range of all the named captures in the regex.
    ----
    import std.range;
    import std.algorithm;

    auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
    auto nc = re.namedCaptures;
    static assert(isRandomAccessRange!(typeof(nc)));
    assert(!nc.empty);
    assert(nc.length == 2);
    assert(nc.equal(["name", "var"]));
    assert(nc[0] == "name");
    assert(nc[1..$].equal(["var"]));
    ----
+/
public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);

/++
    A `StaticRegex` is a `Regex` object that contains D code specially
    generated at compile-time to speed up matching.

    No longer used, kept as an alias to `Regex` for backwards compatibility.
+/
public alias StaticRegex = Regex;

/++
    Compile regular expression pattern for the later execution.
    Returns: `Regex` object that works on inputs having
    the same character width as `pattern`.
363 364 Params: 365 pattern = A single regular expression to match. 366 patterns = An array of regular expression strings. 367 The resulting `Regex` object will match any expression; 368 use $(LREF whichPattern) to know which. 369 flags = The _attributes (g, i, m, s and x accepted) 370 371 Throws: `RegexException` if there were any errors during compilation. 372 +/ 373 @trusted public auto regex(S : C[], C)(const S[] patterns, const(char)[] flags="") 374 if (isSomeString!(S)) 375 { 376 import std.array : appender; 377 import std.functional : memoize; 378 enum cacheSize = 8; //TODO: invent nice interface to control regex caching 379 const(C)[] pat; 380 if (patterns.length > 1) 381 { 382 auto app = appender!S(); foreach(i,p;patterns)383 foreach (i, p; patterns) 384 { 385 if (i != 0) 386 app.put("|"); 387 app.put("(?:"); 388 app.put(patterns[i]); 389 // terminator for the pattern 390 // to detect if the pattern unexpectedly ends 391 app.put("\\"); 392 app.put(cast(dchar)(privateUseStart+i)); 393 app.put(")"); 394 // another one to return correct whichPattern 395 // for all of potential alternatives in the patterns[i] 396 app.put("\\"); 397 app.put(cast(dchar)(privateUseStart+i)); 398 } 399 pat = app.data; 400 } 401 else 402 pat = patterns[0]; 403 404 if (__ctfe) 405 return regexImpl(pat, flags); 406 return memoize!(regexImpl!S, cacheSize)(pat, flags); 407 } 408 409 ///ditto 410 @trusted public auto regex(S)(S pattern, const(char)[] flags="") 411 if (isSomeString!(S)) 412 { 413 return regex([pattern], flags); 414 } 415 416 /// 417 @system unittest 418 { 419 void test(S)() 420 { 421 // multi-pattern regex example 422 S[] arr = [`([a-z]+):(\d+)`, `(\d+),\d+`]; 423 auto multi = regex(arr); // multi regex 424 S str = "abc:43 12,34"; 425 auto m = str.matchAll(multi); 426 assert(m.front.whichPattern == 1); 427 assert(m.front[1] == "abc"); 428 assert(m.front[2] == "43"); 429 m.popFront(); 430 assert(m.front.whichPattern == 2); 431 assert(m.front[1] == "12"); 432 } 433 434 
import std.meta : AliasSeq; 435 static foreach (C; AliasSeq!(string, wstring, dstring)) 436 // Test with const array of patterns - see https://issues.dlang.org/show_bug.cgi?id=20301 437 static foreach (S; AliasSeq!(C, const C, immutable C)) 438 test!S(); 439 } 440 441 @system unittest 442 { 443 import std.conv : to; 444 import std.string : indexOf; 445 446 immutable pattern = "s+"; 447 auto regexString = to!string(regex(pattern, "U")); 448 assert(regexString.length <= pattern.length + 100, "String representation shouldn't be unreasonably bloated."); 449 assert(indexOf(regexString, "s+") >= 0, "String representation should include pattern."); 450 assert(indexOf(regexString, 'U') >= 0, "String representation should include flags."); 451 } 452 453 public auto regexImpl(S)(const S pattern, const(char)[] flags="") 454 if (isSomeString!(typeof(pattern))) 455 { 456 import std.regex.internal.parser : Parser, CodeGen; 457 auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags); 458 auto r = parser.program; 459 return r; 460 } 461 462 463 private struct CTRegexWrapper(Char) 464 { 465 private immutable(Regex!Char)* re; 466 467 // allow code that expects mutable Regex to still work 468 // we stay "logically const" 469 @property @trusted ref getRe() const { return *cast(Regex!Char*) re; } 470 alias getRe this; 471 } 472 473 template ctRegexImpl(alias pattern, string flags=[]) 474 { 475 import std.regex.internal.backtracking, std.regex.internal.parser; 476 static immutable r = cast(immutable) regex(pattern, flags); 477 alias Char = BasicElementOf!(typeof(pattern)); 478 enum source = ctGenRegExCode(r); 479 @trusted pure bool func(BacktrackingMatcher!Char matcher) 480 { 481 debug(std_regex_ctr) pragma(msg, source); 482 cast(void) matcher; 483 mixin(source); 484 } 485 static immutable staticRe = 486 cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func)); 487 enum wrapper = CTRegexWrapper!Char(&staticRe); 488 } 489 490 @safe pure unittest 
491 { 492 // test compat for logical const workaround 493 static void test(StaticRegex!char) 494 { 495 } 496 enum re = ctRegex!``; 497 test(re); 498 } 499 500 @safe pure unittest 501 { 502 auto re = ctRegex!`foo`; 503 assert(matchFirst("foo", re)); 504 505 // test reassignment 506 re = ctRegex!`bar`; 507 assert(matchFirst("bar", re)); 508 assert(!matchFirst("bar", ctRegex!`foo`)); 509 } 510 511 /++ 512 Compile regular expression using CTFE 513 and generate optimized native machine code for matching it. 514 515 Returns: StaticRegex object for faster matching. 516 517 Params: 518 pattern = Regular expression 519 flags = The _attributes (g, i, m, s and x accepted) 520 +/ 521 public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).wrapper; 522 523 enum isRegexFor(RegEx, R) = is(immutable RegEx == immutable Regex!(BasicElementOf!R)) 524 || is(RegEx : const(Regex!(BasicElementOf!R))) 525 || is(immutable RegEx == immutable StaticRegex!(BasicElementOf!R)); 526 527 528 /++ 529 `Captures` object contains submatches captured during a call 530 to `match` or iteration over `RegexMatch` range. 531 532 First element of range is the whole match. 
+/
@trusted public struct Captures(R)
if (isSomeString!R)
{//@trusted because of union inside
    alias DataIndex = size_t;
    alias String = R;
    alias Store = SmallFixedArray!(Group!DataIndex, 3);
private:
    import std.conv : text;
    Store matches;               // begin/end positions of every group; index 0 is the whole match
    const(NamedGroup)[] _names;  // table used to resolve named groups
    R _input;                    // the input that was searched; results are slices of it
    int _nMatch;                 // 0 = no match, otherwise the value reported by whichPattern
    uint _f, _b;                 // current range window over the groups: [_f, _b)

    // Creates an empty (not yet matched) Captures with room for `n` groups.
    this(R input, uint n, const(NamedGroup)[] named)
    {
        _input = input;
        _names = named;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Same, taking the input, names and group count from a RegexMatch.
    this(ref RegexMatch!R rmatch)
    {
        _input = rmatch._input;
        _names = rmatch._engine.pattern.dict;
        immutable n = rmatch._engine.pattern.ngroup;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Slice of the input covered by group `index`,
    // or null when that group tests false (did not participate in the match).
    inout(R) getMatch(size_t index) inout
    {
        auto m = &matches[index];
        return *m ? _input[m.begin .. m.end] : null;
    }

public:
    ///Slice of input prior to the match (the whole input if there was no match).
    @property R pre()
    {
        return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
    }

    ///Slice of input immediately after the match (the whole input if there was no match).
    @property R post()
    {
        return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
    }

    ///Slice of matched portion of input.
    @property R hit()
    {
        assert(_nMatch, "attempted to get hit of an empty match");
        return _input[matches[0].begin .. matches[0].end];
    }

    ///Range interface.
    @property R front()
    {
        assert(_nMatch, "attempted to get front of an empty match");
        return getMatch(_f);
    }

    ///ditto
    @property R back()
    {
        assert(_nMatch, "attempted to get back of an empty match");
        return getMatch(_b - 1);
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        ++_f;
    }

    ///ditto
    void popBack()
    {
        assert(!empty);
        --_b;
    }

    ///ditto
    @property bool empty() const { return _nMatch == 0 || _f >= _b; }

    ///ditto
    inout(R) opIndex()(size_t i) inout
    {
        // the index is relative to the current front of the range
        assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
        return getMatch(_f + i);
    }

    /++
        Explicit cast to bool.
        Useful as a shorthand for !(x.empty) in if and assert statements.

        ---
        import std.regex;

        assert(!matchFirst("nothing", "something"));
        ---
    +/

    @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }

    /++
        Number of the pattern that matched, counting from 1 (the first pattern).
        Returns 0 on no match.
    +/

    @safe @property int whichPattern() const nothrow { return _nMatch; }

    ///
    @system unittest
    {
        import std.regex;
        assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
    }

    /++
        Lookup named submatch.

        ---
        import std.regex;
        import std.range;

        auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
        assert(c["var"] == "a");
        assert(c["value"] == "42");
        popFrontN(c, 2);
        //named groups are unaffected by range primitives
        assert(c["var"] =="a");
        assert(c.front == "42");
        ---
    +/
    R opIndex(String)(String i) /*const*/ //@@@BUG@@@
    if (isSomeString!String)
    {
        size_t index = lookupNamedGroup(_names, i);
        return getMatch(index);
    }

    ///Number of matches in this object.
    @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f;  }

    ///A hook for compatibility with original std.regex.
    @property ref captures(){ return this; }
}

///
@system unittest
{
    import std.range.primitives : popFrontN;

    auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`));
    assert(c.pre == "@"); // Part of input preceding match
    assert(c.post == "#"); // Immediately after match
    assert(c.hit == c[0] && c.hit == "abc"); // The whole match
    assert(c[2] == "b");
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);

    assert(!matchFirst("nothing", "something"));

    // Captures that are not matched will be null.
    c = matchFirst("ac", regex(`a(b)?c`));
    assert(c);
    assert(!c[1]);
}

@system unittest
{
    Captures!string c;
    string s = "abc";
    assert(cast(bool)(c = matchFirst(s, regex("d")))
        || cast(bool)(c = matchFirst(s, regex("a"))));
}

// https://issues.dlang.org/show_bug.cgi?id=19979
@system unittest
{
    auto c = matchFirst("bad", regex(`(^)(not )?bad($)`));
    assert(c[0] && c[0].length == "bad".length);
    assert(c[1] && !c[1].length);
    assert(!c[2]);
    assert(c[3] && !c[3].length);
}

/++
    A regex engine state, as returned by `match` family of functions.

    Effectively it's a forward range of Captures!R, produced
    by lazily searching for matches in a given input.
+/
@trusted public struct RegexMatch(R)
if (isSomeString!R)
{
    import std.typecons : Rebindable;
private:
    alias Char = BasicElementOf!R;
    Matcher!Char _engine;                            // reference-counted matcher (see postblit/destructor)
    Rebindable!(const MatcherFactory!Char) _factory; // creates, duplicates and ref-counts the engine
    R _input;
    Captures!R _captures;                            // result of the most recent match attempt

    // Picks the factory (the regex's own one, or the default when it has none),
    // creates an engine for `input` and immediately searches for the first match.
    this(RegEx)(R input, RegEx prog)
    {
        import std.exception : enforce;
        _input = input;
        if (prog.factory is null) _factory = defaultFactory!Char(prog);
        else _factory = prog.factory;
        _engine = _factory.create(prog, input);
        assert(_engine.refCount == 1);
        _captures = Captures!R(this);
        _captures.matches.mutate((slice) pure { _captures._nMatch = _engine.match(slice); });
    }

public:
    // A copy shares the engine with the original: bump its reference count.
    this(this)
    {
        if (_engine) _factory.incRef(_engine);
    }

    // Release this object's reference to the engine.
    ~this()
    {
        if (_engine) _factory.decRef(_engine);
    }

    ///Shorthands for front.pre, front.post, front.hit.
    @property R pre()
    {
        return _captures.pre;
    }

    ///ditto
    @property R post()
    {
        return _captures.post;
    }

    ///ditto
    @property R hit()
    {
        return _captures.hit;
    }

    /++
        Functionality for processing subsequent matches of global regexes via range interface:
        ---
        import std.regex;
        auto m = matchAll("Hello, world!", regex(`\w+`));
        assert(m.front.hit == "Hello");
        m.popFront();
        assert(m.front.hit == "world");
        m.popFront();
        assert(m.empty);
        ---
    +/
    @property inout(Captures!R) front() inout
    {
        return _captures;
    }

    ///ditto
    void popFront()
    {
        import std.exception : enforce;
        // CoW - if refCount is not 1, we are aliased by somebody else
        if (_engine.refCount != 1)
        {
            // we create a new engine & abandon this reference
            auto old = _engine;
            _engine = _factory.dup(old, _input);
            _factory.decRef(old);
        }
        // search for the next match and record its result in _captures
        _captures.matches.mutate((slice) { _captures._nMatch = _engine.match(slice); });
    }

    ///ditto
    auto save(){ return this; }

    ///Test if this match object is empty.
    @property bool empty() const { return _captures._nMatch == 0; }

    ///Same as !(x.empty), provided for its convenience in conditional statements.
    T opCast(T:bool)(){ return !empty; }

    /// Same as .front, provided for compatibility with original std.regex.
    @property inout(Captures!R) captures() inout { return _captures; }
}

// Finds the first match of `prog` in `input` and returns it as Captures!R.
// The most recently used engine is kept in function-level static variables,
// keyed by (pattern, flags): a call with the same key re-arms that engine on
// the new input instead of creating another one.
private auto matchOnceImpl(RegEx, R)(R input, const auto ref RegEx prog) @trusted
{
    alias Char = BasicElementOf!R;
    static struct Key
    {
        immutable(Char)[] pattern;
        uint flags;
    }
    static Key cacheKey = Key("", -1);
    static Matcher!Char cache;
    auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
    auto key = Key(prog.pattern, prog.flags);
    Matcher!Char engine;
    if (cacheKey == key)
    {
        // same pattern and flags as the previous call: reuse the cached engine
        engine = cache;
        engine.rearm(input);
    }
    else
    {
        engine = factory.create(prog, input);
        if (cache) factory.decRef(cache); // destroy cached engine *after* building a new one
        cache = engine;
        cacheKey = key;
    }
    auto captures = Captures!R(input, prog.ngroup, prog.dict);
    captures.matches.mutate((slice) pure { captures._nMatch = engine.match(slice); });
    return captures;
}

// matchOnce is constructed as a safe, pure wrapper over matchOnceImpl. It can be
// faked as pure because the static mutable variables are used to cache the key and
// character matcher. The technique used avoids delegates and GC.
private @safe auto matchOnce(RegEx, R)(R input, const auto ref RegEx prog) pure
{
    // plain forwarding function whose address can be taken
    static auto impl(R input, const ref RegEx prog)
    {
        return matchOnceImpl(input, prog);
    }

    // calls `impl` through assumePureFunction so it can be used from pure code
    static @trusted auto pureImpl(R input, const ref RegEx prog)
    {
        auto p = assumePureFunction(&impl);
        return p(input, prog);
    }

    return pureImpl(input, prog);
}

// Builds a RegexMatch over `input`, with the global option added to the flags of `re`.
private auto matchMany(RegEx, R)(R input, auto ref RegEx re) @safe
{
    return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global));
}

@system unittest
{
    //sanity checks for new API
    auto re = regex("abc");
    assert(!"abc".matchOnce(re).empty);
    assert("abc".matchOnce(re)[0] == "abc");
}

// https://issues.dlang.org/show_bug.cgi?id=18135
@system unittest
{
    static struct MapResult { RegexMatch!string m; }
    MapResult m;
    m = MapResult();
    assert(m == m);
}

// True when `fun` can be called with a single Captures!R argument
// ("mutator" style); otherwise it is called as fun(captures, sink).
private enum isReplaceFunctor(alias fun, R) =
    __traits(compiles, (Captures!R c) { fun(c); });

// the lowest level - just stuff replacements into the sink
// Writes: text before the match, the replacement produced by `output`,
// text after the match. With empty `captures` the input is copied unchanged.
private @trusted void replaceCapturesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T captures)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    if (captures.empty)
    {
        sink.put(input);
        return;
    }
    sink.put(captures.pre);
    // a hack to get around bogus errors, should be simply output(captures, sink)
    // "is a nested function and cannot be accessed from"
    static if (isReplaceFunctor!(output, R))
        sink.put(output(captures)); //"mutator" type of function
    else
        output(captures, sink); //"output" type of function
    sink.put(captures.post);
}

// ditto for a range of captures
// `offset` is the position in `input` just past the previous match, so each
// step copies only the text between two consecutive matches.
private void replaceMatchesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T matches)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    size_t offset = 0;
    foreach (cap; matches)
    {
        sink.put(cap.pre[offset .. $]);
        // same hack, see replaceCapturesInto
        static if (isReplaceFunctor!(output, R))
            sink.put(output(cap)); //"mutator" type of function
        else
            output(cap, sink); //"output" type of function
        offset = cap.pre.length + cap.hit.length;
    }
    // remainder of the input after the last match
    sink.put(input[offset .. $]);
}

// a general skeleton of replaceFirst
// Returns `input` itself (no new string is built) when nothing matches.
private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto data = matchFirst(input, re);
    if (data.empty)
        return input;
    auto app = appender!(R)();
    replaceCapturesInto!output(app, input, data);
    return app.data;
}

// ditto for replaceAll
// the method parameter allows old API to ride on the back of the new one
private R replaceAllWith(alias output,
        alias method=matchAll, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto matches = method(input, re); //inout(C)[] fails
    if (matches.empty)
        return input;
    auto app = appender!(R)();
    replaceMatchesInto!output(app, input, matches);
    return app.data;
}


/++
    Start matching `input` to regex pattern `re`,
    using Thompson NFA matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on a case by case basis.

    Returns: a `RegexMatch` object holding engine state after first match.
992 +/ 993 994 public auto match(R, RegEx)(R input, RegEx re) 995 if (isSomeString!R && isRegexFor!(RegEx,R)) 996 { 997 return RegexMatch!(Unqual!(typeof(input)))(input, re); 998 } 999 1000 ///ditto 1001 public auto match(R, String)(R input, String re) 1002 if (isSomeString!R && isSomeString!String) 1003 { 1004 return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); 1005 } 1006 1007 /++ 1008 Find the first (leftmost) slice of the `input` that 1009 matches the pattern `re`. This function picks the most suitable 1010 regular expression engine depending on the pattern properties. 1011 1012 `re` parameter can be one of three types: 1013 $(UL 1014 $(LI Plain string(s), in which case it's compiled to bytecode before matching. ) 1015 $(LI Regex!char (wchar/dchar) that contains a pattern in the form of 1016 compiled bytecode. ) 1017 $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of 1018 compiled native machine code. ) 1019 ) 1020 1021 Returns: 1022 $(LREF Captures) containing the extent of a match together with all submatches 1023 if there was a match, otherwise an empty $(LREF Captures) object. 1024 +/ 1025 public auto matchFirst(R, RegEx)(R input, RegEx re) 1026 if (isSomeString!R && isRegexFor!(RegEx, R)) 1027 { 1028 return matchOnce(input, re); 1029 } 1030 1031 ///ditto 1032 public auto matchFirst(R, String)(R input, String re) 1033 if (isSomeString!R && isSomeString!String) 1034 { 1035 return matchOnce(input, regex(re)); 1036 } 1037 1038 ///ditto 1039 public auto matchFirst(R, String)(R input, String[] re...) 1040 if (isSomeString!R && isSomeString!String) 1041 { 1042 return matchOnce(input, regex(re)); 1043 } 1044 1045 /++ 1046 Initiate a search for all non-overlapping matches to the pattern `re` 1047 in the given `input`. The result is a lazy range of matches generated 1048 as they are encountered in the input going left to right. 
1049 1050 This function picks the most suitable regular expression engine 1051 depending on the pattern properties. 1052 1053 `re` parameter can be one of three types: 1054 $(UL 1055 $(LI Plain string(s), in which case it's compiled to bytecode before matching. ) 1056 $(LI Regex!char (wchar/dchar) that contains a pattern in the form of 1057 compiled bytecode. ) 1058 $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of 1059 compiled native machine code. ) 1060 ) 1061 1062 Returns: 1063 $(LREF RegexMatch) object that represents matcher state 1064 after the first match was found or an empty one if not present. 1065 +/ 1066 public auto matchAll(R, RegEx)(R input, RegEx re) 1067 if (isSomeString!R && isRegexFor!(RegEx, R)) 1068 { 1069 return matchMany(input, re); 1070 } 1071 1072 ///ditto 1073 public auto matchAll(R, String)(R input, String re) 1074 if (isSomeString!R && isSomeString!String) 1075 { 1076 return matchMany(input, regex(re)); 1077 } 1078 1079 ///ditto 1080 public auto matchAll(R, String)(R input, String[] re...) 
1081 if (isSomeString!R && isSomeString!String) 1082 { 1083 return matchMany(input, regex(re)); 1084 } 1085 1086 // another set of tests just to cover the new API 1087 @system unittest 1088 { 1089 import std.algorithm.comparison : equal; 1090 import std.algorithm.iteration : map; 1091 import std.conv : to; 1092 1093 static foreach (String; AliasSeq!(string, wstring, const(dchar)[])) 1094 {{ 1095 auto str1 = "blah-bleh".to!String(); 1096 auto pat1 = "bl[ae]h".to!String(); 1097 auto mf = matchFirst(str1, pat1); 1098 assert(mf.equal(["blah".to!String()])); 1099 auto mAll = matchAll(str1, pat1); 1100 assert(mAll.equal!((a,b) => a.equal(b)) 1101 ([["blah".to!String()], ["bleh".to!String()]])); 1102 1103 auto str2 = "1/03/12 - 3/03/12".to!String(); 1104 auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); 1105 auto mf2 = matchFirst(str2, pat2); 1106 assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)())); 1107 auto mAll2 = matchAll(str2, pat2); 1108 assert(mAll2.front.equal(mf2)); 1109 mAll2.popFront(); 1110 assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)())); 1111 mf2.popFrontN(3); 1112 assert(mf2.equal(["12".to!String()])); 1113 1114 auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String()); 1115 auto str = "2 + 34/56 - 6/1".to!String(); 1116 auto cmf = matchFirst(str, ctPat); 1117 assert(cmf.equal(["34/56", "34", "56"].map!(to!String)())); 1118 assert(cmf["Quot"] == "34".to!String()); 1119 assert(cmf["Denom"] == "56".to!String()); 1120 1121 auto cmAll = matchAll(str, ctPat); 1122 assert(cmAll.front.equal(cmf)); 1123 cmAll.popFront(); 1124 assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)())); 1125 }} 1126 } 1127 1128 /++ 1129 Start matching of `input` to regex pattern `re`, 1130 using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking, 1131 backtracking) matching scheme. 1132 1133 The use of this function is $(RED discouraged) - use either of 1134 $(LREF matchAll) or $(LREF matchFirst). 
1135 1136 Delegating the kind of operation 1137 to "g" flag is soon to be phased out along with the 1138 ability to choose the exact matching scheme. The choice of 1139 matching scheme to use depends highly on the pattern kind and 1140 can done automatically on case by case basis. 1141 1142 Returns: a `RegexMatch` object holding engine 1143 state after first match. 1144 1145 +/ 1146 public auto bmatch(R, RegEx)(R input, RegEx re) 1147 if (isSomeString!R && isRegexFor!(RegEx, R)) 1148 { 1149 return RegexMatch!(Unqual!(typeof(input)))(input, re); 1150 } 1151 1152 ///ditto 1153 public auto bmatch(R, String)(R input, String re) 1154 if (isSomeString!R && isSomeString!String) 1155 { 1156 return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); 1157 } 1158 1159 // produces replacement string from format using captures for substitution 1160 package void replaceFmt(R, Capt, OutR) 1161 (R format, Capt captures, OutR sink, bool ignoreBadSubs = false) 1162 if (isOutputRange!(OutR, ElementEncodingType!R[]) && 1163 isOutputRange!(OutR, ElementEncodingType!(Capt.String)[])) 1164 { 1165 import std.algorithm.searching : find; 1166 import std.ascii : isDigit, isAlpha; 1167 import std.conv : text, parse; 1168 import std.exception : enforce; 1169 enum State { Normal, Dollar } 1170 auto state = State.Normal; 1171 size_t offset; 1172 L_Replace_Loop: 1173 while (!format.empty) 1174 final switch (state) 1175 { 1176 case State.Normal: 1177 for (offset = 0; offset < format.length; offset++)//no decoding 1178 { 1179 if (format[offset] == '$') 1180 { 1181 state = State.Dollar; 1182 sink.put(format[0 .. offset]); 1183 format = format[offset+1 .. $];//ditto 1184 continue L_Replace_Loop; 1185 } 1186 } 1187 sink.put(format[0 .. offset]); 1188 format = format[offset .. 
$]; 1189 break; 1190 case State.Dollar: 1191 if (isDigit(format[0])) 1192 { 1193 uint digit = parse!uint(format); 1194 enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit)); 1195 if (digit < captures.length) 1196 sink.put(captures[digit]); 1197 } 1198 else if (format[0] == '{') 1199 { 1200 auto x = find!(a => !isAlpha(a))(format[1..$]); 1201 enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format"); 1202 auto name = format[1 .. $ - x.length]; 1203 format = x[1..$]; 1204 enforce(!name.empty, "invalid name in ${...} replacement format"); 1205 sink.put(captures[name]); 1206 } 1207 else if (format[0] == '&') 1208 { 1209 sink.put(captures[0]); 1210 format = format[1 .. $]; 1211 } 1212 else if (format[0] == '`') 1213 { 1214 sink.put(captures.pre); 1215 format = format[1 .. $]; 1216 } 1217 else if (format[0] == '\'') 1218 { 1219 sink.put(captures.post); 1220 format = format[1 .. $]; 1221 } 1222 else if (format[0] == '$') 1223 { 1224 sink.put(format[0 .. 1]); 1225 format = format[1 .. $]; 1226 } 1227 state = State.Normal; 1228 break; 1229 } 1230 enforce(state == State.Normal, "invalid format string in regex replace"); 1231 } 1232 1233 /++ 1234 Construct a new string from `input` by replacing the first match with 1235 a string generated from it according to the `format` specifier. 1236 1237 To replace all matches use $(LREF replaceAll). 1238 1239 Params: 1240 input = string to search 1241 re = compiled regular expression to use 1242 format = _format string to generate replacements from, 1243 see $(S_LINK Replace _format string, the _format string). 1244 1245 Returns: 1246 A string of the same type with the first match (if any) replaced. 1247 If no match is found returns the input string itself. 
1248 +/ 1249 public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format) 1250 if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) 1251 { 1252 return replaceFirstWith!((m, sink) => replaceFmt(format, m, sink))(input, re); 1253 } 1254 1255 /// 1256 @system unittest 1257 { 1258 assert(replaceFirst("noon", regex("n"), "[$&]") == "[n]oon"); 1259 } 1260 1261 /++ 1262 This is a general replacement tool that construct a new string by replacing 1263 matches of pattern `re` in the `input`. Unlike the other overload 1264 there is no format string instead captures are passed to 1265 to a user-defined functor `fun` that returns a new string 1266 to use as replacement. 1267 1268 This version replaces the first match in `input`, 1269 see $(LREF replaceAll) to replace the all of the matches. 1270 1271 Returns: 1272 A new string of the same type as `input` with all matches 1273 replaced by return values of `fun`. If no matches found 1274 returns the `input` itself. 1275 +/ 1276 public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re) 1277 if (isSomeString!R && isRegexFor!(RegEx, R)) 1278 { 1279 return replaceFirstWith!((m, sink) => sink.put(fun(m)))(input, re); 1280 } 1281 1282 /// 1283 @system unittest 1284 { 1285 import std.conv : to; 1286 string list = "#21 out of 46"; 1287 string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) 1288 (list, regex(`[0-9]+`)); 1289 assert(newList == "#22 out of 46"); 1290 } 1291 1292 /++ 1293 A variation on $(LREF replaceFirst) that instead of allocating a new string 1294 on each call outputs the result piece-wise to the `sink`. In particular 1295 this enables efficient construction of a final output incrementally. 1296 1297 Like in $(LREF replaceFirst) family of functions there is an overload 1298 for the substitution guided by the `format` string 1299 and the one with the user defined callback. 
1300 +/ 1301 public @trusted void replaceFirstInto(Sink, R, C, RegEx) 1302 (ref Sink sink, R input, RegEx re, const(C)[] format) 1303 if (isOutputRange!(Sink, dchar) && isSomeString!R 1304 && is(C : dchar) && isRegexFor!(RegEx, R)) 1305 { 1306 replaceCapturesInto!((m, sink) => replaceFmt(format, m, sink)) 1307 (sink, input, matchFirst(input, re)); 1308 } 1309 1310 ///ditto 1311 public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx) 1312 (Sink sink, R input, RegEx re) 1313 if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) 1314 { 1315 replaceCapturesInto!fun(sink, input, matchFirst(input, re)); 1316 } 1317 1318 /// 1319 @system unittest 1320 { 1321 import std.array; 1322 string m1 = "first message\n"; 1323 string m2 = "second message\n"; 1324 auto result = appender!string(); 1325 replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); 1326 //equivalent of the above with user-defined callback 1327 replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); 1328 assert(result.data == "first\nsecond\n"); 1329 } 1330 1331 //examples for replaceFirst 1332 @system unittest 1333 { 1334 import std.conv; 1335 string list = "#21 out of 46"; 1336 string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) 1337 (list, regex(`[0-9]+`)); 1338 assert(newList == "#22 out of 46"); 1339 import std.array; 1340 string m1 = "first message\n"; 1341 string m2 = "second message\n"; 1342 auto result = appender!string(); 1343 replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); 1344 //equivalent of the above with user-defined callback 1345 replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); 1346 assert(result.data == "first\nsecond\n"); 1347 } 1348 1349 /++ 1350 Construct a new string from `input` by replacing all of the 1351 fragments that match a pattern `re` with a string generated 1352 from the match according to the `format` specifier. 
1353 1354 To replace only the first match use $(LREF replaceFirst). 1355 1356 Params: 1357 input = string to search 1358 re = compiled regular expression to use 1359 format = _format string to generate replacements from, 1360 see $(S_LINK Replace _format string, the _format string). 1361 1362 Returns: 1363 A string of the same type as `input` with the all 1364 of the matches (if any) replaced. 1365 If no match is found returns the input string itself. 1366 +/ 1367 public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) 1368 if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) 1369 { 1370 return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re); 1371 } 1372 1373 /// 1374 @system unittest 1375 { 1376 // insert comma as thousands delimiter 1377 auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); 1378 assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100"); 1379 } 1380 1381 /++ 1382 This is a general replacement tool that construct a new string by replacing 1383 matches of pattern `re` in the `input`. Unlike the other overload 1384 there is no format string instead captures are passed to 1385 to a user-defined functor `fun` that returns a new string 1386 to use as replacement. 1387 1388 This version replaces all of the matches found in `input`, 1389 see $(LREF replaceFirst) to replace the first match only. 1390 1391 Returns: 1392 A new string of the same type as `input` with all matches 1393 replaced by return values of `fun`. If no matches found 1394 returns the `input` itself. 
1395 1396 Params: 1397 input = string to search 1398 re = compiled regular expression 1399 fun = delegate to use 1400 +/ 1401 public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re) 1402 if (isSomeString!R && isRegexFor!(RegEx, R)) 1403 { 1404 return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re); 1405 } 1406 1407 /// 1408 @system unittest 1409 { 1410 string baz(Captures!(string) m) 1411 { 1412 import std.string : toUpper; 1413 return toUpper(m.hit); 1414 } 1415 // Capitalize the letters 'a' and 'r': 1416 auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.", 1417 regex("[ar]")); 1418 assert(s == "StRAp A Rocket engine on A chicken."); 1419 } 1420 1421 /++ 1422 A variation on $(LREF replaceAll) that instead of allocating a new string 1423 on each call outputs the result piece-wise to the `sink`. In particular 1424 this enables efficient construction of a final output incrementally. 1425 1426 As with $(LREF replaceAll) there are 2 overloads - one with a format string, 1427 the other one with a user defined functor. 
1428 +/ 1429 public @trusted void replaceAllInto(Sink, R, C, RegEx) 1430 (Sink sink, R input, RegEx re, const(C)[] format) 1431 if (isOutputRange!(Sink, dchar) && isSomeString!R 1432 && is(C : dchar) && isRegexFor!(RegEx, R)) 1433 { 1434 replaceMatchesInto!((m, sink) => replaceFmt(format, m, sink)) 1435 (sink, input, matchAll(input, re)); 1436 } 1437 1438 ///ditto 1439 public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) 1440 (Sink sink, R input, RegEx re) 1441 if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) 1442 { 1443 replaceMatchesInto!fun(sink, input, matchAll(input, re)); 1444 } 1445 1446 /// 1447 @system unittest 1448 { 1449 // insert comma as thousands delimiter in fifty randomly produced big numbers 1450 import std.array, std.conv, std.random, std.range; 1451 static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g"); 1452 auto sink = appender!(char [])(); 1453 enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19; 1454 foreach (i; 0 .. 50) 1455 { 1456 sink.clear(); 1457 replaceAllInto(sink, text(uniform(min, max)), re, ","); 1458 foreach (pos; iota(sink.data.length - 4, 0, -4)) 1459 assert(sink.data[pos] == ','); 1460 } 1461 } 1462 1463 // exercise all of the replace APIs 1464 @system unittest 1465 { 1466 import std.array : appender; 1467 import std.conv; 1468 // try and check first/all simple substitution 1469 static foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[])) 1470 {{ 1471 S s1 = "curt trial".to!S(); 1472 S s2 = "round dome".to!S(); 1473 S t1F = "court trial".to!S(); 1474 S t2F = "hound dome".to!S(); 1475 S t1A = "court trial".to!S(); 1476 S t2A = "hound home".to!S(); 1477 auto re1 = regex("curt".to!S()); 1478 auto re2 = regex("[dr]o".to!S()); 1479 1480 assert(replaceFirst(s1, re1, "court") == t1F); 1481 assert(replaceFirst(s2, re2, "ho") == t2F); 1482 assert(replaceAll(s1, re1, "court") == t1A); 1483 assert(replaceAll(s2, re2, "ho") == t2A); 1484 1485 auto rep1 = replaceFirst!(cap => 
cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1); 1486 assert(rep1 == t1F); 1487 assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F); 1488 auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1); 1489 assert(rep1A == t1A); 1490 assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A); 1491 1492 auto sink = appender!S(); 1493 replaceFirstInto(sink, s1, re1, "court"); 1494 assert(sink.data == t1F); 1495 replaceFirstInto(sink, s2, re2, "ho"); 1496 assert(sink.data == t1F~t2F); 1497 replaceAllInto(sink, s1, re1, "court"); 1498 assert(sink.data == t1F~t2F~t1A); 1499 replaceAllInto(sink, s2, re2, "ho"); 1500 assert(sink.data == t1F~t2F~t1A~t2A); 1501 }} 1502 } 1503 1504 /++ 1505 Old API for replacement, operation depends on flags of pattern `re`. 1506 With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it 1507 works the same as $(LREF replaceFirst). 1508 1509 The use of this function is $(RED discouraged), please use $(LREF replaceAll) 1510 or $(LREF replaceFirst) explicitly. 1511 +/ 1512 public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format) 1513 if (isSomeString!R && isRegexFor!(RegEx, R)) 1514 { 1515 return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), match)(input, re); 1516 } 1517 1518 ///ditto 1519 public R replace(alias fun, R, RegEx)(R input, RegEx re) 1520 if (isSomeString!R && isRegexFor!(RegEx, R)) 1521 { 1522 return replaceAllWith!(fun, match)(input, re); 1523 } 1524 1525 /** 1526 Splits a string `r` using a regular expression `pat` as a separator. 
1527 1528 Params: 1529 keepSeparators = flag to specify if the matches should be in the resulting range 1530 r = the string to split 1531 pat = the pattern to split on 1532 Returns: 1533 A lazy range of strings 1534 */ 1535 public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex) 1536 if (isSomeString!Range && isRegexFor!(RegEx, Range)) 1537 { 1538 private: 1539 Range _input; 1540 size_t _offset; 1541 alias Rx = typeof(match(Range.init,RegEx.init)); 1542 Rx _match; 1543 1544 static if (keepSeparators) bool onMatch = false; 1545 thisSplitter1546 @trusted this(Range input, RegEx separator) 1547 {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted 1548 _input = input; 1549 const re = separator.withFlags(separator.flags | RegexOption.global); 1550 if (_input.empty) 1551 { 1552 //there is nothing to match at all, make _offset > 0 1553 _offset = 1; 1554 } 1555 else 1556 { 1557 _match = Rx(_input, re); 1558 1559 static if (keepSeparators) 1560 if (_match.pre.empty) 1561 popFront(); 1562 } 1563 } 1564 1565 public: 1566 auto ref opSlice() 1567 { 1568 return this.save; 1569 } 1570 1571 ///Forward range primitives. 1572 @property Range front() 1573 { 1574 import std.algorithm.comparison : min; 1575 1576 assert(!empty && _offset <= _match.pre.length 1577 && _match.pre.length <= _input.length); 1578 1579 static if (keepSeparators) 1580 { 1581 if (!onMatch) 1582 return _input[_offset .. min($, _match.pre.length)]; 1583 else 1584 return _match.hit(); 1585 } 1586 else 1587 { 1588 return _input[_offset .. 
min($, _match.pre.length)]; 1589 } 1590 } 1591 1592 ///ditto 1593 @property bool empty() 1594 { 1595 static if (keepSeparators) 1596 return _offset >= _input.length; 1597 else 1598 return _offset > _input.length; 1599 } 1600 1601 ///ditto 1602 void popFront() 1603 { 1604 assert(!empty); 1605 if (_match.empty) 1606 { 1607 //No more separators, work is done here 1608 _offset = _input.length + 1; 1609 } 1610 else 1611 { 1612 static if (keepSeparators) 1613 { 1614 if (!onMatch) 1615 { 1616 //skip past the separator 1617 _offset = _match.pre.length; 1618 } 1619 else 1620 { 1621 _offset += _match.hit.length; 1622 _match.popFront(); 1623 } 1624 1625 onMatch = !onMatch; 1626 } 1627 else 1628 { 1629 //skip past the separator 1630 _offset = _match.pre.length + _match.hit.length; 1631 _match.popFront(); 1632 } 1633 } 1634 } 1635 1636 ///ditto 1637 @property auto save() 1638 { 1639 return this; 1640 } 1641 } 1642 1643 /// ditto 1644 public Splitter!(keepSeparators, Range, RegEx) splitter( 1645 Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat) 1646 if ( 1647 is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) 1648 { 1649 return Splitter!(keepSeparators, Range, RegEx)(r, pat); 1650 } 1651 1652 /// 1653 @system unittest 1654 { 1655 import std.algorithm.comparison : equal; 1656 auto s1 = ", abc, de, fg, hi, "; 1657 assert(equal(splitter(s1, regex(", *")), 1658 ["", "abc", "de", "fg", "hi", ""])); 1659 } 1660 1661 /// Split on a pattern, but keep the matches in the resulting range 1662 @system unittest 1663 { 1664 import std.algorithm.comparison : equal; 1665 import std.typecons : Yes; 1666 1667 auto pattern = regex(`([\.,])`); 1668 1669 assert("2003.04.05" 1670 .splitter!(Yes.keepSeparators)(pattern) 1671 .equal(["2003", ".", "04", ".", "05"])); 1672 1673 assert(",1,2,3" 1674 .splitter!(Yes.keepSeparators)(pattern) 1675 .equal([",", "1", ",", "2", ",", "3"])); 1676 } 1677 1678 ///An eager version of `splitter` that creates an 
array with splitted slices of `input`. 1679 public @trusted String[] split(String, RegEx)(String input, RegEx rx) 1680 if (isSomeString!String && isRegexFor!(RegEx, String)) 1681 { 1682 import std.array : appender; 1683 auto a = appender!(String[])(); 1684 foreach (e; splitter(input, rx)) 1685 a.put(e); 1686 return a.data; 1687 } 1688 1689 ///Exception object thrown in case of errors during regex compilation. 1690 public alias RegexException = std.regex.internal.ir.RegexException; 1691 1692 /++ 1693 A range that lazily produces a string output escaped 1694 to be used inside of a regular expression. 1695 +/ 1696 auto escaper(Range)(Range r) 1697 { 1698 import std.algorithm.searching : find; 1699 static immutable escapables = [Escapables]; 1700 static struct Escaper // template to deduce attributes 1701 { 1702 Range r; 1703 bool escaped; 1704 1705 @property ElementType!Range front(){ 1706 if (escaped) 1707 return '\\'; 1708 else 1709 return r.front; 1710 } 1711 1712 @property bool empty(){ return r.empty; } 1713 1714 void popFront(){ 1715 if (escaped) escaped = false; 1716 else 1717 { 1718 r.popFront(); 1719 if (!r.empty && !escapables.find(r.front).empty) 1720 escaped = true; 1721 } 1722 } 1723 1724 @property auto save(){ return Escaper(r.save, escaped); } 1725 } 1726 1727 bool escaped = !r.empty && !escapables.find(r.front).empty; 1728 return Escaper(r, escaped); 1729 } 1730 1731 /// 1732 @system unittest 1733 { 1734 import std.algorithm.comparison; 1735 import std.regex; 1736 string s = `This is {unfriendly} to *regex*`; 1737 assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`)); 1738 } 1739 1740 @system unittest 1741 { 1742 import std.algorithm.comparison; 1743 import std.conv; 1744 static foreach (S; AliasSeq!(string, wstring, dstring)) 1745 {{ 1746 auto s = "^".to!S; 1747 assert(s.escaper.equal(`\^`)); 1748 auto s2 = ""; 1749 assert(s2.escaper.equal("")); 1750 }} 1751 } 1752 1753 @system unittest 1754 { 1755 assert("ab".matchFirst(regex(`a?b?`)).hit == 
"ab"); 1756 assert("ab".matchFirst(regex(`a??b?`)).hit == ""); 1757 } 1758