10b57cec5SDimitry Andric //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements a POSIX regular expression matcher.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric
130b57cec5SDimitry Andric #include "llvm/Support/Regex.h"
140b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
150b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
160b57cec5SDimitry Andric #include "llvm/ADT/Twine.h"
1706c3fb27SDimitry Andric #include "regex_impl.h"
1806c3fb27SDimitry Andric
195ffd83dbSDimitry Andric #include <cassert>
200b57cec5SDimitry Andric #include <string>
210b57cec5SDimitry Andric
220b57cec5SDimitry Andric using namespace llvm;
230b57cec5SDimitry Andric
Regex()240b57cec5SDimitry Andric Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
250b57cec5SDimitry Andric
Regex(StringRef regex,RegexFlags Flags)265ffd83dbSDimitry Andric Regex::Regex(StringRef regex, RegexFlags Flags) {
270b57cec5SDimitry Andric unsigned flags = 0;
280b57cec5SDimitry Andric preg = new llvm_regex();
290b57cec5SDimitry Andric preg->re_endp = regex.end();
300b57cec5SDimitry Andric if (Flags & IgnoreCase)
310b57cec5SDimitry Andric flags |= REG_ICASE;
320b57cec5SDimitry Andric if (Flags & Newline)
330b57cec5SDimitry Andric flags |= REG_NEWLINE;
340b57cec5SDimitry Andric if (!(Flags & BasicRegex))
350b57cec5SDimitry Andric flags |= REG_EXTENDED;
360b57cec5SDimitry Andric error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
370b57cec5SDimitry Andric }
380b57cec5SDimitry Andric
Regex(StringRef regex,unsigned Flags)395ffd83dbSDimitry Andric Regex::Regex(StringRef regex, unsigned Flags)
405ffd83dbSDimitry Andric : Regex(regex, static_cast<RegexFlags>(Flags)) {}
415ffd83dbSDimitry Andric
Regex(Regex && regex)420b57cec5SDimitry Andric Regex::Regex(Regex &®ex) {
430b57cec5SDimitry Andric preg = regex.preg;
440b57cec5SDimitry Andric error = regex.error;
450b57cec5SDimitry Andric regex.preg = nullptr;
460b57cec5SDimitry Andric regex.error = REG_BADPAT;
470b57cec5SDimitry Andric }
480b57cec5SDimitry Andric
~Regex()490b57cec5SDimitry Andric Regex::~Regex() {
500b57cec5SDimitry Andric if (preg) {
510b57cec5SDimitry Andric llvm_regfree(preg);
520b57cec5SDimitry Andric delete preg;
530b57cec5SDimitry Andric }
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric
568bcb0991SDimitry Andric namespace {
570b57cec5SDimitry Andric
588bcb0991SDimitry Andric /// Utility to convert a regex error code into a human-readable string.
RegexErrorToString(int error,struct llvm_regex * preg,std::string & Error)598bcb0991SDimitry Andric void RegexErrorToString(int error, struct llvm_regex *preg,
608bcb0991SDimitry Andric std::string &Error) {
610b57cec5SDimitry Andric size_t len = llvm_regerror(error, preg, nullptr, 0);
620b57cec5SDimitry Andric
630b57cec5SDimitry Andric Error.resize(len - 1);
640b57cec5SDimitry Andric llvm_regerror(error, preg, &Error[0], len);
658bcb0991SDimitry Andric }
668bcb0991SDimitry Andric
678bcb0991SDimitry Andric } // namespace
688bcb0991SDimitry Andric
isValid(std::string & Error) const698bcb0991SDimitry Andric bool Regex::isValid(std::string &Error) const {
708bcb0991SDimitry Andric if (!error)
718bcb0991SDimitry Andric return true;
728bcb0991SDimitry Andric
738bcb0991SDimitry Andric RegexErrorToString(error, preg, Error);
740b57cec5SDimitry Andric return false;
750b57cec5SDimitry Andric }
760b57cec5SDimitry Andric
770b57cec5SDimitry Andric /// getNumMatches - In a valid regex, return the number of parenthesized
780b57cec5SDimitry Andric /// matches it contains.
getNumMatches() const790b57cec5SDimitry Andric unsigned Regex::getNumMatches() const {
800b57cec5SDimitry Andric return preg->re_nsub;
810b57cec5SDimitry Andric }
820b57cec5SDimitry Andric
match(StringRef String,SmallVectorImpl<StringRef> * Matches,std::string * Error) const838bcb0991SDimitry Andric bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches,
848bcb0991SDimitry Andric std::string *Error) const {
858bcb0991SDimitry Andric // Reset error, if given.
868bcb0991SDimitry Andric if (Error && !Error->empty())
878bcb0991SDimitry Andric *Error = "";
888bcb0991SDimitry Andric
898bcb0991SDimitry Andric // Check if the regex itself didn't successfully compile.
908bcb0991SDimitry Andric if (Error ? !isValid(*Error) : !isValid())
910b57cec5SDimitry Andric return false;
920b57cec5SDimitry Andric
930b57cec5SDimitry Andric unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
940b57cec5SDimitry Andric
95*5f757f3fSDimitry Andric // Update null string to empty string.
96*5f757f3fSDimitry Andric if (String.data() == nullptr)
97*5f757f3fSDimitry Andric String = "";
98*5f757f3fSDimitry Andric
990b57cec5SDimitry Andric // pmatch needs to have at least one element.
1000b57cec5SDimitry Andric SmallVector<llvm_regmatch_t, 8> pm;
1010b57cec5SDimitry Andric pm.resize(nmatch > 0 ? nmatch : 1);
1020b57cec5SDimitry Andric pm[0].rm_so = 0;
1030b57cec5SDimitry Andric pm[0].rm_eo = String.size();
1040b57cec5SDimitry Andric
1050b57cec5SDimitry Andric int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
1060b57cec5SDimitry Andric
1078bcb0991SDimitry Andric // Failure to match is not an error, it's just a normal return value.
1088bcb0991SDimitry Andric // Any other error code is considered abnormal, and is logged in the Error.
1090b57cec5SDimitry Andric if (rc == REG_NOMATCH)
1100b57cec5SDimitry Andric return false;
1110b57cec5SDimitry Andric if (rc != 0) {
1128bcb0991SDimitry Andric if (Error)
1138bcb0991SDimitry Andric RegexErrorToString(error, preg, *Error);
1140b57cec5SDimitry Andric return false;
1150b57cec5SDimitry Andric }
1160b57cec5SDimitry Andric
1170b57cec5SDimitry Andric // There was a match.
1180b57cec5SDimitry Andric
1190b57cec5SDimitry Andric if (Matches) { // match position requested
1200b57cec5SDimitry Andric Matches->clear();
1210b57cec5SDimitry Andric
1220b57cec5SDimitry Andric for (unsigned i = 0; i != nmatch; ++i) {
1230b57cec5SDimitry Andric if (pm[i].rm_so == -1) {
1240b57cec5SDimitry Andric // this group didn't match
1250b57cec5SDimitry Andric Matches->push_back(StringRef());
1260b57cec5SDimitry Andric continue;
1270b57cec5SDimitry Andric }
1280b57cec5SDimitry Andric assert(pm[i].rm_eo >= pm[i].rm_so);
1290b57cec5SDimitry Andric Matches->push_back(StringRef(String.data()+pm[i].rm_so,
1300b57cec5SDimitry Andric pm[i].rm_eo-pm[i].rm_so));
1310b57cec5SDimitry Andric }
1320b57cec5SDimitry Andric }
1330b57cec5SDimitry Andric
1340b57cec5SDimitry Andric return true;
1350b57cec5SDimitry Andric }
1360b57cec5SDimitry Andric
sub(StringRef Repl,StringRef String,std::string * Error) const1370b57cec5SDimitry Andric std::string Regex::sub(StringRef Repl, StringRef String,
1388bcb0991SDimitry Andric std::string *Error) const {
1390b57cec5SDimitry Andric SmallVector<StringRef, 8> Matches;
1400b57cec5SDimitry Andric
1410b57cec5SDimitry Andric // Return the input if there was no match.
1428bcb0991SDimitry Andric if (!match(String, &Matches, Error))
1435ffd83dbSDimitry Andric return std::string(String);
1440b57cec5SDimitry Andric
1450b57cec5SDimitry Andric // Otherwise splice in the replacement string, starting with the prefix before
1460b57cec5SDimitry Andric // the match.
1470b57cec5SDimitry Andric std::string Res(String.begin(), Matches[0].begin());
1480b57cec5SDimitry Andric
1490b57cec5SDimitry Andric // Then the replacement string, honoring possible substitutions.
1500b57cec5SDimitry Andric while (!Repl.empty()) {
1510b57cec5SDimitry Andric // Skip to the next escape.
1520b57cec5SDimitry Andric std::pair<StringRef, StringRef> Split = Repl.split('\\');
1530b57cec5SDimitry Andric
1540b57cec5SDimitry Andric // Add the skipped substring.
1550b57cec5SDimitry Andric Res += Split.first;
1560b57cec5SDimitry Andric
1570b57cec5SDimitry Andric // Check for terminimation and trailing backslash.
1580b57cec5SDimitry Andric if (Split.second.empty()) {
1590b57cec5SDimitry Andric if (Repl.size() != Split.first.size() &&
1600b57cec5SDimitry Andric Error && Error->empty())
1610b57cec5SDimitry Andric *Error = "replacement string contained trailing backslash";
1620b57cec5SDimitry Andric break;
1630b57cec5SDimitry Andric }
1640b57cec5SDimitry Andric
1650b57cec5SDimitry Andric // Otherwise update the replacement string and interpret escapes.
1660b57cec5SDimitry Andric Repl = Split.second;
1670b57cec5SDimitry Andric
1680b57cec5SDimitry Andric // FIXME: We should have a StringExtras function for mapping C99 escapes.
1690b57cec5SDimitry Andric switch (Repl[0]) {
170*5f757f3fSDimitry Andric
171*5f757f3fSDimitry Andric // Backreference with the "\g<ref>" syntax
172*5f757f3fSDimitry Andric case 'g':
173*5f757f3fSDimitry Andric if (Repl.size() >= 4 && Repl[1] == '<') {
174*5f757f3fSDimitry Andric size_t End = Repl.find('>');
175*5f757f3fSDimitry Andric StringRef Ref = Repl.slice(2, End);
176*5f757f3fSDimitry Andric unsigned RefValue;
177*5f757f3fSDimitry Andric if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) {
178*5f757f3fSDimitry Andric Repl = Repl.substr(End + 1);
179*5f757f3fSDimitry Andric if (RefValue < Matches.size())
180*5f757f3fSDimitry Andric Res += Matches[RefValue];
181*5f757f3fSDimitry Andric else if (Error && Error->empty())
182*5f757f3fSDimitry Andric *Error =
183*5f757f3fSDimitry Andric ("invalid backreference string 'g<" + Twine(Ref) + ">'").str();
184*5f757f3fSDimitry Andric break;
185*5f757f3fSDimitry Andric }
186*5f757f3fSDimitry Andric }
187*5f757f3fSDimitry Andric [[fallthrough]];
188*5f757f3fSDimitry Andric
1890b57cec5SDimitry Andric // Treat all unrecognized characters as self-quoting.
1900b57cec5SDimitry Andric default:
1910b57cec5SDimitry Andric Res += Repl[0];
1920b57cec5SDimitry Andric Repl = Repl.substr(1);
1930b57cec5SDimitry Andric break;
1940b57cec5SDimitry Andric
1950b57cec5SDimitry Andric // Single character escapes.
1960b57cec5SDimitry Andric case 't':
1970b57cec5SDimitry Andric Res += '\t';
1980b57cec5SDimitry Andric Repl = Repl.substr(1);
1990b57cec5SDimitry Andric break;
2000b57cec5SDimitry Andric case 'n':
2010b57cec5SDimitry Andric Res += '\n';
2020b57cec5SDimitry Andric Repl = Repl.substr(1);
2030b57cec5SDimitry Andric break;
2040b57cec5SDimitry Andric
2050b57cec5SDimitry Andric // Decimal escapes are backreferences.
2060b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4':
2070b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': {
2080b57cec5SDimitry Andric // Extract the backreference number.
2090b57cec5SDimitry Andric StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
2100b57cec5SDimitry Andric Repl = Repl.substr(Ref.size());
2110b57cec5SDimitry Andric
2120b57cec5SDimitry Andric unsigned RefValue;
2130b57cec5SDimitry Andric if (!Ref.getAsInteger(10, RefValue) &&
2140b57cec5SDimitry Andric RefValue < Matches.size())
2150b57cec5SDimitry Andric Res += Matches[RefValue];
2160b57cec5SDimitry Andric else if (Error && Error->empty())
2170b57cec5SDimitry Andric *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
2180b57cec5SDimitry Andric break;
2190b57cec5SDimitry Andric }
2200b57cec5SDimitry Andric }
2210b57cec5SDimitry Andric }
2220b57cec5SDimitry Andric
2230b57cec5SDimitry Andric // And finally the suffix.
2240b57cec5SDimitry Andric Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
2250b57cec5SDimitry Andric
2260b57cec5SDimitry Andric return Res;
2270b57cec5SDimitry Andric }
2280b57cec5SDimitry Andric
2290b57cec5SDimitry Andric // These are the special characters matched in functions like "p_ere_exp".
2300b57cec5SDimitry Andric static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
2310b57cec5SDimitry Andric
isLiteralERE(StringRef Str)2320b57cec5SDimitry Andric bool Regex::isLiteralERE(StringRef Str) {
2330b57cec5SDimitry Andric // Check for regex metacharacters. This list was derived from our regex
2340b57cec5SDimitry Andric // implementation in regcomp.c and double checked against the POSIX extended
2350b57cec5SDimitry Andric // regular expression specification.
2360b57cec5SDimitry Andric return Str.find_first_of(RegexMetachars) == StringRef::npos;
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric
escape(StringRef String)2390b57cec5SDimitry Andric std::string Regex::escape(StringRef String) {
2400b57cec5SDimitry Andric std::string RegexStr;
2414824e7fdSDimitry Andric for (char C : String) {
2424824e7fdSDimitry Andric if (strchr(RegexMetachars, C))
2430b57cec5SDimitry Andric RegexStr += '\\';
2444824e7fdSDimitry Andric RegexStr += C;
2450b57cec5SDimitry Andric }
2460b57cec5SDimitry Andric
2470b57cec5SDimitry Andric return RegexStr;
2480b57cec5SDimitry Andric }
249