1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a POSIX regular expression matcher. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/Regex.h" 15 #include "regex_impl.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/Twine.h" 19 #include <string> 20 using namespace llvm; 21 22 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 23 24 Regex::Regex(StringRef regex, unsigned Flags) { 25 unsigned flags = 0; 26 preg = new llvm_regex(); 27 preg->re_endp = regex.end(); 28 if (Flags & IgnoreCase) 29 flags |= REG_ICASE; 30 if (Flags & Newline) 31 flags |= REG_NEWLINE; 32 if (!(Flags & BasicRegex)) 33 flags |= REG_EXTENDED; 34 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 35 } 36 37 Regex::~Regex() { 38 if (preg) { 39 llvm_regfree(preg); 40 delete preg; 41 } 42 } 43 44 bool Regex::isValid(std::string &Error) { 45 if (!error) 46 return true; 47 48 size_t len = llvm_regerror(error, preg, nullptr, 0); 49 50 Error.resize(len - 1); 51 llvm_regerror(error, preg, &Error[0], len); 52 return false; 53 } 54 55 /// getNumMatches - In a valid regex, return the number of parenthesized 56 /// matches it contains. 57 unsigned Regex::getNumMatches() const { 58 return preg->re_nsub; 59 } 60 61 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 62 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 63 64 // pmatch needs to have at least one element. 65 SmallVector<llvm_regmatch_t, 8> pm; 66 pm.resize(nmatch > 0 ? nmatch : 1); 67 pm[0].rm_so = 0; 68 pm[0].rm_eo = String.size(); 69 70 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 71 72 if (rc == REG_NOMATCH) 73 return false; 74 if (rc != 0) { 75 // regexec can fail due to invalid pattern or running out of memory. 76 error = rc; 77 return false; 78 } 79 80 // There was a match. 81 82 if (Matches) { // match position requested 83 Matches->clear(); 84 85 for (unsigned i = 0; i != nmatch; ++i) { 86 if (pm[i].rm_so == -1) { 87 // this group didn't match 88 Matches->push_back(StringRef()); 89 continue; 90 } 91 assert(pm[i].rm_eo >= pm[i].rm_so); 92 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 93 pm[i].rm_eo-pm[i].rm_so)); 94 } 95 } 96 97 return true; 98 } 99 100 std::string Regex::sub(StringRef Repl, StringRef String, 101 std::string *Error) { 102 SmallVector<StringRef, 8> Matches; 103 104 // Reset error, if given. 105 if (Error && !Error->empty()) *Error = ""; 106 107 // Return the input if there was no match. 108 if (!match(String, &Matches)) 109 return String; 110 111 // Otherwise splice in the replacement string, starting with the prefix before 112 // the match. 113 std::string Res(String.begin(), Matches[0].begin()); 114 115 // Then the replacement string, honoring possible substitutions. 116 while (!Repl.empty()) { 117 // Skip to the next escape. 118 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 119 120 // Add the skipped substring. 121 Res += Split.first; 122 123 // Check for terminimation and trailing backslash. 124 if (Split.second.empty()) { 125 if (Repl.size() != Split.first.size() && 126 Error && Error->empty()) 127 *Error = "replacement string contained trailing backslash"; 128 break; 129 } 130 131 // Otherwise update the replacement string and interpret escapes. 132 Repl = Split.second; 133 134 // FIXME: We should have a StringExtras function for mapping C99 escapes. 135 switch (Repl[0]) { 136 // Treat all unrecognized characters as self-quoting. 137 default: 138 Res += Repl[0]; 139 Repl = Repl.substr(1); 140 break; 141 142 // Single character escapes. 143 case 't': 144 Res += '\t'; 145 Repl = Repl.substr(1); 146 break; 147 case 'n': 148 Res += '\n'; 149 Repl = Repl.substr(1); 150 break; 151 152 // Decimal escapes are backreferences. 153 case '0': case '1': case '2': case '3': case '4': 154 case '5': case '6': case '7': case '8': case '9': { 155 // Extract the backreference number. 156 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 157 Repl = Repl.substr(Ref.size()); 158 159 unsigned RefValue; 160 if (!Ref.getAsInteger(10, RefValue) && 161 RefValue < Matches.size()) 162 Res += Matches[RefValue]; 163 else if (Error && Error->empty()) 164 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 165 break; 166 } 167 } 168 } 169 170 // And finally the suffix. 171 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 172 173 return Res; 174 } 175 176 // These are the special characters matched in functions like "p_ere_exp". 177 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 178 179 bool Regex::isLiteralERE(StringRef Str) { 180 // Check for regex metacharacters. This list was derived from our regex 181 // implementation in regcomp.c and double checked against the POSIX extended 182 // regular expression specification. 183 return Str.find_first_of(RegexMetachars) == StringRef::npos; 184 } 185 186 std::string Regex::escape(StringRef String) { 187 std::string RegexStr; 188 for (unsigned i = 0, e = String.size(); i != e; ++i) { 189 if (strchr(RegexMetachars, String[i])) 190 RegexStr += '\\'; 191 RegexStr += String[i]; 192 } 193 194 return RegexStr; 195 } 196