xref: /freebsd-src/contrib/llvm-project/llvm/lib/Support/Regex.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
10b57cec5SDimitry Andric //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements a POSIX regular expression matcher.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "llvm/Support/Regex.h"
140b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
150b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
160b57cec5SDimitry Andric #include "llvm/ADT/Twine.h"
1706c3fb27SDimitry Andric #include "regex_impl.h"
1806c3fb27SDimitry Andric 
195ffd83dbSDimitry Andric #include <cassert>
200b57cec5SDimitry Andric #include <string>
210b57cec5SDimitry Andric 
220b57cec5SDimitry Andric using namespace llvm;
230b57cec5SDimitry Andric 
Regex()240b57cec5SDimitry Andric Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
250b57cec5SDimitry Andric 
Regex(StringRef regex,RegexFlags Flags)265ffd83dbSDimitry Andric Regex::Regex(StringRef regex, RegexFlags Flags) {
270b57cec5SDimitry Andric   unsigned flags = 0;
280b57cec5SDimitry Andric   preg = new llvm_regex();
290b57cec5SDimitry Andric   preg->re_endp = regex.end();
300b57cec5SDimitry Andric   if (Flags & IgnoreCase)
310b57cec5SDimitry Andric     flags |= REG_ICASE;
320b57cec5SDimitry Andric   if (Flags & Newline)
330b57cec5SDimitry Andric     flags |= REG_NEWLINE;
340b57cec5SDimitry Andric   if (!(Flags & BasicRegex))
350b57cec5SDimitry Andric     flags |= REG_EXTENDED;
360b57cec5SDimitry Andric   error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
370b57cec5SDimitry Andric }
380b57cec5SDimitry Andric 
Regex(StringRef regex,unsigned Flags)395ffd83dbSDimitry Andric Regex::Regex(StringRef regex, unsigned Flags)
405ffd83dbSDimitry Andric     : Regex(regex, static_cast<RegexFlags>(Flags)) {}
415ffd83dbSDimitry Andric 
Regex(Regex && regex)420b57cec5SDimitry Andric Regex::Regex(Regex &&regex) {
430b57cec5SDimitry Andric   preg = regex.preg;
440b57cec5SDimitry Andric   error = regex.error;
450b57cec5SDimitry Andric   regex.preg = nullptr;
460b57cec5SDimitry Andric   regex.error = REG_BADPAT;
470b57cec5SDimitry Andric }
480b57cec5SDimitry Andric 
~Regex()490b57cec5SDimitry Andric Regex::~Regex() {
500b57cec5SDimitry Andric   if (preg) {
510b57cec5SDimitry Andric     llvm_regfree(preg);
520b57cec5SDimitry Andric     delete preg;
530b57cec5SDimitry Andric   }
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric 
568bcb0991SDimitry Andric namespace {
570b57cec5SDimitry Andric 
588bcb0991SDimitry Andric /// Utility to convert a regex error code into a human-readable string.
RegexErrorToString(int error,struct llvm_regex * preg,std::string & Error)598bcb0991SDimitry Andric void RegexErrorToString(int error, struct llvm_regex *preg,
608bcb0991SDimitry Andric                         std::string &Error) {
610b57cec5SDimitry Andric   size_t len = llvm_regerror(error, preg, nullptr, 0);
620b57cec5SDimitry Andric 
630b57cec5SDimitry Andric   Error.resize(len - 1);
640b57cec5SDimitry Andric   llvm_regerror(error, preg, &Error[0], len);
658bcb0991SDimitry Andric }
668bcb0991SDimitry Andric 
678bcb0991SDimitry Andric } // namespace
688bcb0991SDimitry Andric 
isValid(std::string & Error) const698bcb0991SDimitry Andric bool Regex::isValid(std::string &Error) const {
708bcb0991SDimitry Andric   if (!error)
718bcb0991SDimitry Andric     return true;
728bcb0991SDimitry Andric 
738bcb0991SDimitry Andric   RegexErrorToString(error, preg, Error);
740b57cec5SDimitry Andric   return false;
750b57cec5SDimitry Andric }
760b57cec5SDimitry Andric 
770b57cec5SDimitry Andric /// getNumMatches - In a valid regex, return the number of parenthesized
780b57cec5SDimitry Andric /// matches it contains.
getNumMatches() const790b57cec5SDimitry Andric unsigned Regex::getNumMatches() const {
800b57cec5SDimitry Andric   return preg->re_nsub;
810b57cec5SDimitry Andric }
820b57cec5SDimitry Andric 
match(StringRef String,SmallVectorImpl<StringRef> * Matches,std::string * Error) const838bcb0991SDimitry Andric bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches,
848bcb0991SDimitry Andric                   std::string *Error) const {
858bcb0991SDimitry Andric   // Reset error, if given.
868bcb0991SDimitry Andric   if (Error && !Error->empty())
878bcb0991SDimitry Andric     *Error = "";
888bcb0991SDimitry Andric 
898bcb0991SDimitry Andric   // Check if the regex itself didn't successfully compile.
908bcb0991SDimitry Andric   if (Error ? !isValid(*Error) : !isValid())
910b57cec5SDimitry Andric     return false;
920b57cec5SDimitry Andric 
930b57cec5SDimitry Andric   unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
940b57cec5SDimitry Andric 
95*5f757f3fSDimitry Andric   // Update null string to empty string.
96*5f757f3fSDimitry Andric   if (String.data() == nullptr)
97*5f757f3fSDimitry Andric     String = "";
98*5f757f3fSDimitry Andric 
990b57cec5SDimitry Andric   // pmatch needs to have at least one element.
1000b57cec5SDimitry Andric   SmallVector<llvm_regmatch_t, 8> pm;
1010b57cec5SDimitry Andric   pm.resize(nmatch > 0 ? nmatch : 1);
1020b57cec5SDimitry Andric   pm[0].rm_so = 0;
1030b57cec5SDimitry Andric   pm[0].rm_eo = String.size();
1040b57cec5SDimitry Andric 
1050b57cec5SDimitry Andric   int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
1060b57cec5SDimitry Andric 
1078bcb0991SDimitry Andric   // Failure to match is not an error, it's just a normal return value.
1088bcb0991SDimitry Andric   // Any other error code is considered abnormal, and is logged in the Error.
1090b57cec5SDimitry Andric   if (rc == REG_NOMATCH)
1100b57cec5SDimitry Andric     return false;
1110b57cec5SDimitry Andric   if (rc != 0) {
1128bcb0991SDimitry Andric     if (Error)
1138bcb0991SDimitry Andric       RegexErrorToString(error, preg, *Error);
1140b57cec5SDimitry Andric     return false;
1150b57cec5SDimitry Andric   }
1160b57cec5SDimitry Andric 
1170b57cec5SDimitry Andric   // There was a match.
1180b57cec5SDimitry Andric 
1190b57cec5SDimitry Andric   if (Matches) { // match position requested
1200b57cec5SDimitry Andric     Matches->clear();
1210b57cec5SDimitry Andric 
1220b57cec5SDimitry Andric     for (unsigned i = 0; i != nmatch; ++i) {
1230b57cec5SDimitry Andric       if (pm[i].rm_so == -1) {
1240b57cec5SDimitry Andric         // this group didn't match
1250b57cec5SDimitry Andric         Matches->push_back(StringRef());
1260b57cec5SDimitry Andric         continue;
1270b57cec5SDimitry Andric       }
1280b57cec5SDimitry Andric       assert(pm[i].rm_eo >= pm[i].rm_so);
1290b57cec5SDimitry Andric       Matches->push_back(StringRef(String.data()+pm[i].rm_so,
1300b57cec5SDimitry Andric                                    pm[i].rm_eo-pm[i].rm_so));
1310b57cec5SDimitry Andric     }
1320b57cec5SDimitry Andric   }
1330b57cec5SDimitry Andric 
1340b57cec5SDimitry Andric   return true;
1350b57cec5SDimitry Andric }
1360b57cec5SDimitry Andric 
sub(StringRef Repl,StringRef String,std::string * Error) const1370b57cec5SDimitry Andric std::string Regex::sub(StringRef Repl, StringRef String,
1388bcb0991SDimitry Andric                        std::string *Error) const {
1390b57cec5SDimitry Andric   SmallVector<StringRef, 8> Matches;
1400b57cec5SDimitry Andric 
1410b57cec5SDimitry Andric   // Return the input if there was no match.
1428bcb0991SDimitry Andric   if (!match(String, &Matches, Error))
1435ffd83dbSDimitry Andric     return std::string(String);
1440b57cec5SDimitry Andric 
1450b57cec5SDimitry Andric   // Otherwise splice in the replacement string, starting with the prefix before
1460b57cec5SDimitry Andric   // the match.
1470b57cec5SDimitry Andric   std::string Res(String.begin(), Matches[0].begin());
1480b57cec5SDimitry Andric 
1490b57cec5SDimitry Andric   // Then the replacement string, honoring possible substitutions.
1500b57cec5SDimitry Andric   while (!Repl.empty()) {
1510b57cec5SDimitry Andric     // Skip to the next escape.
1520b57cec5SDimitry Andric     std::pair<StringRef, StringRef> Split = Repl.split('\\');
1530b57cec5SDimitry Andric 
1540b57cec5SDimitry Andric     // Add the skipped substring.
1550b57cec5SDimitry Andric     Res += Split.first;
1560b57cec5SDimitry Andric 
1570b57cec5SDimitry Andric     // Check for terminimation and trailing backslash.
1580b57cec5SDimitry Andric     if (Split.second.empty()) {
1590b57cec5SDimitry Andric       if (Repl.size() != Split.first.size() &&
1600b57cec5SDimitry Andric           Error && Error->empty())
1610b57cec5SDimitry Andric         *Error = "replacement string contained trailing backslash";
1620b57cec5SDimitry Andric       break;
1630b57cec5SDimitry Andric     }
1640b57cec5SDimitry Andric 
1650b57cec5SDimitry Andric     // Otherwise update the replacement string and interpret escapes.
1660b57cec5SDimitry Andric     Repl = Split.second;
1670b57cec5SDimitry Andric 
1680b57cec5SDimitry Andric     // FIXME: We should have a StringExtras function for mapping C99 escapes.
1690b57cec5SDimitry Andric     switch (Repl[0]) {
170*5f757f3fSDimitry Andric 
171*5f757f3fSDimitry Andric       // Backreference with the "\g<ref>" syntax
172*5f757f3fSDimitry Andric     case 'g':
173*5f757f3fSDimitry Andric       if (Repl.size() >= 4 && Repl[1] == '<') {
174*5f757f3fSDimitry Andric         size_t End = Repl.find('>');
175*5f757f3fSDimitry Andric         StringRef Ref = Repl.slice(2, End);
176*5f757f3fSDimitry Andric         unsigned RefValue;
177*5f757f3fSDimitry Andric         if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) {
178*5f757f3fSDimitry Andric           Repl = Repl.substr(End + 1);
179*5f757f3fSDimitry Andric           if (RefValue < Matches.size())
180*5f757f3fSDimitry Andric             Res += Matches[RefValue];
181*5f757f3fSDimitry Andric           else if (Error && Error->empty())
182*5f757f3fSDimitry Andric             *Error =
183*5f757f3fSDimitry Andric                 ("invalid backreference string 'g<" + Twine(Ref) + ">'").str();
184*5f757f3fSDimitry Andric           break;
185*5f757f3fSDimitry Andric         }
186*5f757f3fSDimitry Andric       }
187*5f757f3fSDimitry Andric       [[fallthrough]];
188*5f757f3fSDimitry Andric 
1890b57cec5SDimitry Andric       // Treat all unrecognized characters as self-quoting.
1900b57cec5SDimitry Andric     default:
1910b57cec5SDimitry Andric       Res += Repl[0];
1920b57cec5SDimitry Andric       Repl = Repl.substr(1);
1930b57cec5SDimitry Andric       break;
1940b57cec5SDimitry Andric 
1950b57cec5SDimitry Andric       // Single character escapes.
1960b57cec5SDimitry Andric     case 't':
1970b57cec5SDimitry Andric       Res += '\t';
1980b57cec5SDimitry Andric       Repl = Repl.substr(1);
1990b57cec5SDimitry Andric       break;
2000b57cec5SDimitry Andric     case 'n':
2010b57cec5SDimitry Andric       Res += '\n';
2020b57cec5SDimitry Andric       Repl = Repl.substr(1);
2030b57cec5SDimitry Andric       break;
2040b57cec5SDimitry Andric 
2050b57cec5SDimitry Andric       // Decimal escapes are backreferences.
2060b57cec5SDimitry Andric     case '0': case '1': case '2': case '3': case '4':
2070b57cec5SDimitry Andric     case '5': case '6': case '7': case '8': case '9': {
2080b57cec5SDimitry Andric       // Extract the backreference number.
2090b57cec5SDimitry Andric       StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
2100b57cec5SDimitry Andric       Repl = Repl.substr(Ref.size());
2110b57cec5SDimitry Andric 
2120b57cec5SDimitry Andric       unsigned RefValue;
2130b57cec5SDimitry Andric       if (!Ref.getAsInteger(10, RefValue) &&
2140b57cec5SDimitry Andric           RefValue < Matches.size())
2150b57cec5SDimitry Andric         Res += Matches[RefValue];
2160b57cec5SDimitry Andric       else if (Error && Error->empty())
2170b57cec5SDimitry Andric         *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
2180b57cec5SDimitry Andric       break;
2190b57cec5SDimitry Andric     }
2200b57cec5SDimitry Andric     }
2210b57cec5SDimitry Andric   }
2220b57cec5SDimitry Andric 
2230b57cec5SDimitry Andric   // And finally the suffix.
2240b57cec5SDimitry Andric   Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
2250b57cec5SDimitry Andric 
2260b57cec5SDimitry Andric   return Res;
2270b57cec5SDimitry Andric }
2280b57cec5SDimitry Andric 
2290b57cec5SDimitry Andric // These are the special characters matched in functions like "p_ere_exp".
2300b57cec5SDimitry Andric static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
2310b57cec5SDimitry Andric 
isLiteralERE(StringRef Str)2320b57cec5SDimitry Andric bool Regex::isLiteralERE(StringRef Str) {
2330b57cec5SDimitry Andric   // Check for regex metacharacters.  This list was derived from our regex
2340b57cec5SDimitry Andric   // implementation in regcomp.c and double checked against the POSIX extended
2350b57cec5SDimitry Andric   // regular expression specification.
2360b57cec5SDimitry Andric   return Str.find_first_of(RegexMetachars) == StringRef::npos;
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric 
escape(StringRef String)2390b57cec5SDimitry Andric std::string Regex::escape(StringRef String) {
2400b57cec5SDimitry Andric   std::string RegexStr;
2414824e7fdSDimitry Andric   for (char C : String) {
2424824e7fdSDimitry Andric     if (strchr(RegexMetachars, C))
2430b57cec5SDimitry Andric       RegexStr += '\\';
2444824e7fdSDimitry Andric     RegexStr += C;
2450b57cec5SDimitry Andric   }
2460b57cec5SDimitry Andric 
2470b57cec5SDimitry Andric   return RegexStr;
2480b57cec5SDimitry Andric }
249