xref: /llvm-project/llvm/utils/lit/lit/ShUtil.py (revision b71edfaa4ec3c998aadb35255ce2f60bba2940b0)
1from __future__ import absolute_import
2import itertools
3
4import lit.util
5from lit.ShCommands import Command, GlobItem, Pipeline, Seq
6
7
class ShLexer:
    """Lexer for a simplified POSIX-shell command syntax.

    Splits ``data`` into a stream of tokens:
      - plain strings for ordinary arguments,
      - ``GlobItem`` wrappers for arguments containing *unquoted* glob
        characters (``*`` or ``?``),
      - 1-tuples such as ``('|',)``, ``('&&',)``, ``('>>',)`` for operators,
      - 2-tuples such as ``('>', 2)`` for redirections prefixed by a file
        descriptor number (e.g. ``2>``).
    """

    def __init__(self, data, win32Escapes=False):
        self.data = data
        self.pos = 0
        self.end = len(data)
        # On Windows, backslash is a path separator, so when win32Escapes is
        # set it is not treated as an escape character outside of quotes.
        self.win32Escapes = win32Escapes

    def eat(self):
        """Consume and return the next character."""
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        """Return the next character without consuming it."""
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed."""
        if self.data[self.pos] == c:
            self.pos += 1
            return True
        return False

    def lex_arg_fast(self, c):
        """Fast path for lexing one argument: take the whitespace-delimited
        chunk starting at the current character and accept it wholesale if it
        contains none of the shell's special characters.  Returns None when
        the slow path must be used instead."""
        # Get the leading whitespace free section.
        chunk = self.data[self.pos - 1 :].split(None, 1)[0]

        # If it has special characters, the fast path failed.
        if (
            "|" in chunk
            or "&" in chunk
            or "<" in chunk
            or ">" in chunk
            or "'" in chunk
            or '"' in chunk
            or ";" in chunk
            or "\\" in chunk
        ):
            return None

        self.pos = self.pos - 1 + len(chunk)
        return GlobItem(chunk) if "*" in chunk or "?" in chunk else chunk

    def lex_arg_slow(self, c):
        """General argument lexing: handles quoting, backslash escapes,
        glob-character tracking and fd-prefixed redirections (e.g. '2>').
        ``c`` is the already-consumed first character of the argument."""
        if c in "'\"":
            s = self.lex_arg_quoted(c)
        else:
            s = c
        unquoted_glob_char = False
        quoted_glob_char = False
        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in "><":
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the parse string isn't an integer, do the usual thing.
                if not s.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(s)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if "*" in quoted_arg or "?" in quoted_arg:
                    quoted_glob_char = True
                s += quoted_arg
            elif not self.win32Escapes and c == "\\":
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data
                    )
                    return s
                s += self.eat()
            elif c in "*?":
                unquoted_glob_char = True
                s += self.eat()
            else:
                s += self.eat()
        # If a quote character is present, lex_arg_quoted will remove the quotes
        # and append the argument directly.  This causes a problem when the
        # quoted portion contains a glob character, as the character will no
        # longer be treated literally.  If glob characters occur *only* inside
        # of quotes, then we can handle this by not globbing at all, and if
        # glob characters occur *only* outside of quotes, we can still glob just
        # fine.  But if a glob character occurs both inside and outside of
        # quotes this presents a problem.  In practice this is such an obscure
        # edge case that it doesn't seem worth the added complexity to support.
        # By adding an assertion, it means some bot somewhere will catch this
        # and flag the user of a non-portable test (which could almost certainly
        # be re-written to work correctly without triggering this).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(s) if unquoted_glob_char else s

    def lex_arg_quoted(self, delim):
        """Lex the remainder of a quoted string whose opening ``delim`` has
        already been consumed, returning its contents without the quotes.
        Inside double quotes, backslash escapes only '"' and '\\'."""
        s = ""
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return s
            elif c == "\\" and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data
                    )
                    return s
                c = self.eat()
                if c == '"':
                    s += '"'
                elif c == "\\":
                    s += "\\"
                else:
                    s += "\\" + c
            else:
                s += c
        lit.util.warning("missing quote character in %r" % self.data)
        return s

    def lex_arg_checked(self, c):
        """Debug helper: run both the fast and slow argument lexers and
        raise if they disagree on the result or the end position."""
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (self.pos, end))
        return reference

    def lex_arg(self, c):
        """Lex one argument starting with the already-consumed character c."""
        return self.lex_arg_fast(c) or self.lex_arg_slow(c)

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token."""

        c = self.eat()
        if c == ";":
            return (c,)
        if c == "|":
            if self.maybe_eat("|"):
                return ("||",)
            return (c,)
        if c == "&":
            if self.maybe_eat("&"):
                return ("&&",)
            if self.maybe_eat(">"):
                return ("&>",)
            return (c,)
        if c == ">":
            if self.maybe_eat("&"):
                return (">&",)
            if self.maybe_eat(">"):
                return (">>",)
            return (c,)
        if c == "<":
            if self.maybe_eat("&"):
                return ("<&",)
            # NOTE(review): '<' followed by '>' is lexed as the '<<' token.
            # This looks surprising ('<<' in sh is '<' then '<'), but it is
            # the historical behavior here — confirm against consumers of
            # the '<<' token before changing.
            if self.maybe_eat(">"):
                return ("<<",)
            return (c,)

        return self.lex_arg(c)

    def lex(self):
        """Generate all tokens in the input, skipping whitespace."""
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()
193
194
195###
196
197
class ShParser:
    """Parser for the token stream produced by ShLexer.

    Turns ``data`` into a tree of Command, Pipeline and Seq nodes.
    ``win32Escapes`` is forwarded to the lexer; ``pipefail`` is recorded
    on every Pipeline produced.
    """

    def __init__(self, data, win32Escapes=False, pipefail=False):
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes=win32Escapes).lex()

    def lex(self):
        """Consume and return the next token, or None at end of input."""
        for item in self.tokens:
            return item
        return None

    def look(self):
        """Return the next token without consuming it (None at EOF)."""
        token = self.lex()
        if token is not None:
            # Push the token back by chaining it in front of the stream.
            self.tokens = itertools.chain([token], self.tokens)
        return token

    def parse_command(self):
        """Parse a single command: arguments plus any redirections, up to
        (but not including) a terminator token.  Raises ValueError on
        syntax errors."""
        tok = self.lex()
        if not tok:
            raise ValueError("empty command!")
        if isinstance(tok, tuple):
            raise ValueError("syntax error near unexpected token %r" % tok[0])

        args = [tok]
        redirects = []
        while True:
            tok = self.look()

            # EOF?
            if tok is None:
                break

            # If this is an argument, just add it to the current command.
            if isinstance(tok, (str, GlobItem)):
                args.append(self.lex())
                continue

            # Otherwise see if it is a terminator.
            assert isinstance(tok, tuple)
            if tok[0] in ("|", ";", "&", "||", "&&"):
                break

            # Otherwise it must be a redirection.
            op = self.lex()
            arg = self.lex()
            if not arg:
                raise ValueError("syntax error near token %r" % op[0])
            redirects.append((op, arg))

        return Command(args, redirects)

    def parse_pipeline(self):
        """Parse one or more commands joined by '|'."""
        negate = False

        commands = [self.parse_command()]
        while self.look() == ("|",):
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, negate, self.pipefail)

    def parse(self):
        """Parse the whole input into a tree of Seq/Pipeline nodes.

        Sequencing operators (';', '&', '||', '&&') are folded
        left-associatively with no precedence distinction."""
        lhs = self.parse_pipeline()

        while self.look():
            operator = self.lex()
            assert isinstance(operator, tuple) and len(operator) == 1

            if not self.look():
                raise ValueError("missing argument to operator %r" % operator[0])

            # FIXME: Operator precedence!!
            lhs = Seq(lhs, operator[0], self.parse_pipeline())

        return lhs
273