//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a glob pattern matcher.
//
//===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric
130b57cec5SDimitry Andric #include "llvm/Support/GlobPattern.h"
140b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
150b57cec5SDimitry Andric #include "llvm/Support/Errc.h"
160b57cec5SDimitry Andric
170b57cec5SDimitry Andric using namespace llvm;
180b57cec5SDimitry Andric
190b57cec5SDimitry Andric // Expands character ranges and returns a bitmap.
200b57cec5SDimitry Andric // For example, "a-cf-hz" is expanded to "abcfghz".
expand(StringRef S,StringRef Original)210b57cec5SDimitry Andric static Expected<BitVector> expand(StringRef S, StringRef Original) {
220b57cec5SDimitry Andric BitVector BV(256, false);
230b57cec5SDimitry Andric
240b57cec5SDimitry Andric // Expand X-Y.
250b57cec5SDimitry Andric for (;;) {
260b57cec5SDimitry Andric if (S.size() < 3)
270b57cec5SDimitry Andric break;
280b57cec5SDimitry Andric
290b57cec5SDimitry Andric uint8_t Start = S[0];
300b57cec5SDimitry Andric uint8_t End = S[2];
310b57cec5SDimitry Andric
320b57cec5SDimitry Andric // If it doesn't start with something like X-Y,
330b57cec5SDimitry Andric // consume the first character and proceed.
340b57cec5SDimitry Andric if (S[1] != '-') {
350b57cec5SDimitry Andric BV[Start] = true;
360b57cec5SDimitry Andric S = S.substr(1);
370b57cec5SDimitry Andric continue;
380b57cec5SDimitry Andric }
390b57cec5SDimitry Andric
400b57cec5SDimitry Andric // It must be in the form of X-Y.
410b57cec5SDimitry Andric // Validate it and then interpret the range.
420b57cec5SDimitry Andric if (Start > End)
430b57cec5SDimitry Andric return make_error<StringError>("invalid glob pattern: " + Original,
440b57cec5SDimitry Andric errc::invalid_argument);
450b57cec5SDimitry Andric
460b57cec5SDimitry Andric for (int C = Start; C <= End; ++C)
470b57cec5SDimitry Andric BV[(uint8_t)C] = true;
480b57cec5SDimitry Andric S = S.substr(3);
490b57cec5SDimitry Andric }
500b57cec5SDimitry Andric
510b57cec5SDimitry Andric for (char C : S)
520b57cec5SDimitry Andric BV[(uint8_t)C] = true;
530b57cec5SDimitry Andric return BV;
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric
56*5f757f3fSDimitry Andric // Identify brace expansions in S and return the list of patterns they expand
57*5f757f3fSDimitry Andric // into.
58*5f757f3fSDimitry Andric static Expected<SmallVector<std::string, 1>>
parseBraceExpansions(StringRef S,std::optional<size_t> MaxSubPatterns)59*5f757f3fSDimitry Andric parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
60*5f757f3fSDimitry Andric SmallVector<std::string> SubPatterns = {S.str()};
61*5f757f3fSDimitry Andric if (!MaxSubPatterns || !S.contains('{'))
62*5f757f3fSDimitry Andric return std::move(SubPatterns);
63*5f757f3fSDimitry Andric
64*5f757f3fSDimitry Andric struct BraceExpansion {
65*5f757f3fSDimitry Andric size_t Start;
66*5f757f3fSDimitry Andric size_t Length;
67*5f757f3fSDimitry Andric SmallVector<StringRef, 2> Terms;
68*5f757f3fSDimitry Andric };
69*5f757f3fSDimitry Andric SmallVector<BraceExpansion, 0> BraceExpansions;
70*5f757f3fSDimitry Andric
71*5f757f3fSDimitry Andric BraceExpansion *CurrentBE = nullptr;
72*5f757f3fSDimitry Andric size_t TermBegin;
73*5f757f3fSDimitry Andric for (size_t I = 0, E = S.size(); I != E; ++I) {
74*5f757f3fSDimitry Andric if (S[I] == '[') {
75*5f757f3fSDimitry Andric I = S.find(']', I + 2);
76*5f757f3fSDimitry Andric if (I == std::string::npos)
77*5f757f3fSDimitry Andric return make_error<StringError>("invalid glob pattern, unmatched '['",
78*5f757f3fSDimitry Andric errc::invalid_argument);
79*5f757f3fSDimitry Andric } else if (S[I] == '{') {
80*5f757f3fSDimitry Andric if (CurrentBE)
81*5f757f3fSDimitry Andric return make_error<StringError>(
82*5f757f3fSDimitry Andric "nested brace expansions are not supported",
83*5f757f3fSDimitry Andric errc::invalid_argument);
84*5f757f3fSDimitry Andric CurrentBE = &BraceExpansions.emplace_back();
85*5f757f3fSDimitry Andric CurrentBE->Start = I;
86*5f757f3fSDimitry Andric TermBegin = I + 1;
87*5f757f3fSDimitry Andric } else if (S[I] == ',') {
88*5f757f3fSDimitry Andric if (!CurrentBE)
89*5f757f3fSDimitry Andric continue;
90*5f757f3fSDimitry Andric CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
91*5f757f3fSDimitry Andric TermBegin = I + 1;
92*5f757f3fSDimitry Andric } else if (S[I] == '}') {
93*5f757f3fSDimitry Andric if (!CurrentBE)
94*5f757f3fSDimitry Andric continue;
95*5f757f3fSDimitry Andric if (CurrentBE->Terms.empty())
96*5f757f3fSDimitry Andric return make_error<StringError>(
97*5f757f3fSDimitry Andric "empty or singleton brace expansions are not supported",
98*5f757f3fSDimitry Andric errc::invalid_argument);
99*5f757f3fSDimitry Andric CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
100*5f757f3fSDimitry Andric CurrentBE->Length = I - CurrentBE->Start + 1;
101*5f757f3fSDimitry Andric CurrentBE = nullptr;
102*5f757f3fSDimitry Andric } else if (S[I] == '\\') {
103*5f757f3fSDimitry Andric if (++I == E)
104*5f757f3fSDimitry Andric return make_error<StringError>("invalid glob pattern, stray '\\'",
105*5f757f3fSDimitry Andric errc::invalid_argument);
106*5f757f3fSDimitry Andric }
107*5f757f3fSDimitry Andric }
108*5f757f3fSDimitry Andric if (CurrentBE)
109*5f757f3fSDimitry Andric return make_error<StringError>("incomplete brace expansion",
1100b57cec5SDimitry Andric errc::invalid_argument);
1110b57cec5SDimitry Andric
112*5f757f3fSDimitry Andric size_t NumSubPatterns = 1;
113*5f757f3fSDimitry Andric for (auto &BE : BraceExpansions) {
114*5f757f3fSDimitry Andric if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
115*5f757f3fSDimitry Andric NumSubPatterns = std::numeric_limits<size_t>::max();
116*5f757f3fSDimitry Andric break;
1170b57cec5SDimitry Andric }
118*5f757f3fSDimitry Andric NumSubPatterns *= BE.Terms.size();
1190b57cec5SDimitry Andric }
120*5f757f3fSDimitry Andric if (NumSubPatterns > *MaxSubPatterns)
121*5f757f3fSDimitry Andric return make_error<StringError>("too many brace expansions",
122*5f757f3fSDimitry Andric errc::invalid_argument);
123*5f757f3fSDimitry Andric // Replace brace expansions in reverse order so that we don't invalidate
124*5f757f3fSDimitry Andric // earlier start indices
125*5f757f3fSDimitry Andric for (auto &BE : reverse(BraceExpansions)) {
126*5f757f3fSDimitry Andric SmallVector<std::string> OrigSubPatterns;
127*5f757f3fSDimitry Andric std::swap(SubPatterns, OrigSubPatterns);
128*5f757f3fSDimitry Andric for (StringRef Term : BE.Terms)
129*5f757f3fSDimitry Andric for (StringRef Orig : OrigSubPatterns)
130*5f757f3fSDimitry Andric SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
1310b57cec5SDimitry Andric }
132*5f757f3fSDimitry Andric return std::move(SubPatterns);
1330b57cec5SDimitry Andric }
1340b57cec5SDimitry Andric
135*5f757f3fSDimitry Andric Expected<GlobPattern>
create(StringRef S,std::optional<size_t> MaxSubPatterns)136*5f757f3fSDimitry Andric GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
1370b57cec5SDimitry Andric GlobPattern Pat;
1380b57cec5SDimitry Andric
139*5f757f3fSDimitry Andric // Store the prefix that does not contain any metacharacter.
140*5f757f3fSDimitry Andric size_t PrefixSize = S.find_first_of("?*[{\\");
141*5f757f3fSDimitry Andric Pat.Prefix = S.substr(0, PrefixSize);
142*5f757f3fSDimitry Andric if (PrefixSize == std::string::npos)
143*5f757f3fSDimitry Andric return Pat;
144*5f757f3fSDimitry Andric S = S.substr(PrefixSize);
145*5f757f3fSDimitry Andric
146*5f757f3fSDimitry Andric SmallVector<std::string, 1> SubPats;
147*5f757f3fSDimitry Andric if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
148*5f757f3fSDimitry Andric return std::move(Err);
149*5f757f3fSDimitry Andric for (StringRef SubPat : SubPats) {
150*5f757f3fSDimitry Andric auto SubGlobOrErr = SubGlobPattern::create(SubPat);
151*5f757f3fSDimitry Andric if (!SubGlobOrErr)
152*5f757f3fSDimitry Andric return SubGlobOrErr.takeError();
153*5f757f3fSDimitry Andric Pat.SubGlobs.push_back(*SubGlobOrErr);
154*5f757f3fSDimitry Andric }
155*5f757f3fSDimitry Andric
1560b57cec5SDimitry Andric return Pat;
1570b57cec5SDimitry Andric }
1580b57cec5SDimitry Andric
159*5f757f3fSDimitry Andric Expected<GlobPattern::SubGlobPattern>
create(StringRef S)160*5f757f3fSDimitry Andric GlobPattern::SubGlobPattern::create(StringRef S) {
161*5f757f3fSDimitry Andric SubGlobPattern Pat;
1620b57cec5SDimitry Andric
163*5f757f3fSDimitry Andric // Parse brackets.
164*5f757f3fSDimitry Andric Pat.Pat.assign(S.begin(), S.end());
165*5f757f3fSDimitry Andric for (size_t I = 0, E = S.size(); I != E; ++I) {
166*5f757f3fSDimitry Andric if (S[I] == '[') {
167*5f757f3fSDimitry Andric // ']' is allowed as the first character of a character class. '[]' is
168*5f757f3fSDimitry Andric // invalid. So, just skip the first character.
169*5f757f3fSDimitry Andric ++I;
170*5f757f3fSDimitry Andric size_t J = S.find(']', I + 1);
171*5f757f3fSDimitry Andric if (J == StringRef::npos)
172*5f757f3fSDimitry Andric return make_error<StringError>("invalid glob pattern, unmatched '['",
173*5f757f3fSDimitry Andric errc::invalid_argument);
174*5f757f3fSDimitry Andric StringRef Chars = S.substr(I, J - I);
175*5f757f3fSDimitry Andric bool Invert = S[I] == '^' || S[I] == '!';
176*5f757f3fSDimitry Andric Expected<BitVector> BV =
177*5f757f3fSDimitry Andric Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
1780b57cec5SDimitry Andric if (!BV)
1790b57cec5SDimitry Andric return BV.takeError();
180*5f757f3fSDimitry Andric if (Invert)
181*5f757f3fSDimitry Andric BV->flip();
182*5f757f3fSDimitry Andric Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
183*5f757f3fSDimitry Andric I = J;
184*5f757f3fSDimitry Andric } else if (S[I] == '\\') {
185*5f757f3fSDimitry Andric if (++I == E)
186*5f757f3fSDimitry Andric return make_error<StringError>("invalid glob pattern, stray '\\'",
187*5f757f3fSDimitry Andric errc::invalid_argument);
188*5f757f3fSDimitry Andric }
1890b57cec5SDimitry Andric }
1900b57cec5SDimitry Andric return Pat;
1910b57cec5SDimitry Andric }
1920b57cec5SDimitry Andric
match(StringRef S) const1930b57cec5SDimitry Andric bool GlobPattern::match(StringRef S) const {
194*5f757f3fSDimitry Andric if (!S.consume_front(Prefix))
195*5f757f3fSDimitry Andric return false;
196*5f757f3fSDimitry Andric if (SubGlobs.empty() && S.empty())
1970b57cec5SDimitry Andric return true;
198*5f757f3fSDimitry Andric for (auto &Glob : SubGlobs)
199*5f757f3fSDimitry Andric if (Glob.match(S))
2000b57cec5SDimitry Andric return true;
2010b57cec5SDimitry Andric return false;
2020b57cec5SDimitry Andric }
2030b57cec5SDimitry Andric
204*5f757f3fSDimitry Andric // Factor the pattern into segments split by '*'. The segment is matched
205*5f757f3fSDimitry Andric // sequentianlly by finding the first occurrence past the end of the previous
206*5f757f3fSDimitry Andric // match.
match(StringRef Str) const207*5f757f3fSDimitry Andric bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
208*5f757f3fSDimitry Andric const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
209*5f757f3fSDimitry Andric *SavedS = S;
210*5f757f3fSDimitry Andric const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
211*5f757f3fSDimitry Andric size_t B = 0, SavedB = 0;
212*5f757f3fSDimitry Andric while (S != End) {
213*5f757f3fSDimitry Andric if (P == PEnd)
214*5f757f3fSDimitry Andric ;
215*5f757f3fSDimitry Andric else if (*P == '*') {
216*5f757f3fSDimitry Andric // The non-* substring on the left of '*' matches the tail of S. Save the
217*5f757f3fSDimitry Andric // positions to be used by backtracking if we see a mismatch later.
218*5f757f3fSDimitry Andric SegmentBegin = ++P;
219*5f757f3fSDimitry Andric SavedS = S;
220*5f757f3fSDimitry Andric SavedB = B;
221*5f757f3fSDimitry Andric continue;
222*5f757f3fSDimitry Andric } else if (*P == '[') {
223*5f757f3fSDimitry Andric if (Brackets[B].Bytes[uint8_t(*S)]) {
224*5f757f3fSDimitry Andric P = Pat.data() + Brackets[B++].NextOffset;
225*5f757f3fSDimitry Andric ++S;
226*5f757f3fSDimitry Andric continue;
2270b57cec5SDimitry Andric }
228*5f757f3fSDimitry Andric } else if (*P == '\\') {
229*5f757f3fSDimitry Andric if (*++P == *S) {
230*5f757f3fSDimitry Andric ++P;
231*5f757f3fSDimitry Andric ++S;
232*5f757f3fSDimitry Andric continue;
233*5f757f3fSDimitry Andric }
234*5f757f3fSDimitry Andric } else if (*P == *S || *P == '?') {
235*5f757f3fSDimitry Andric ++P;
236*5f757f3fSDimitry Andric ++S;
237*5f757f3fSDimitry Andric continue;
238*5f757f3fSDimitry Andric }
239*5f757f3fSDimitry Andric if (!SegmentBegin)
240*5f757f3fSDimitry Andric return false;
241*5f757f3fSDimitry Andric // We have seen a '*'. Backtrack to the saved positions. Shift the S
242*5f757f3fSDimitry Andric // position to probe the next starting position in the segment.
243*5f757f3fSDimitry Andric P = SegmentBegin;
244*5f757f3fSDimitry Andric S = ++SavedS;
245*5f757f3fSDimitry Andric B = SavedB;
246*5f757f3fSDimitry Andric }
247*5f757f3fSDimitry Andric // All bytes in Str have been matched. Return true if the rest part of Pat is
248*5f757f3fSDimitry Andric // empty or contains only '*'.
249*5f757f3fSDimitry Andric return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
2500b57cec5SDimitry Andric }
251