from __future__ import absolute_import
import itertools

import lit.util
from lit.ShCommands import Command, GlobItem, Pipeline, Seq


class ShLexer:
    """Lexer for a small 'sh'-like command language.

    Yields a stream of tokens in which:
      * operators are 1-tuples, e.g. ('|',), ('&&',), ('>>',);
      * redirections with a leading file descriptor are 2-tuples,
        e.g. '2>' lexes to ('>', 2);
      * arguments are plain strings, or GlobItem instances when they
        contain unquoted glob characters ('*' or '?').
    """

    def __init__(self, data, win32Escapes=False):
        self.data = data  # Input string being lexed.
        self.pos = 0  # Current scan position within data.
        self.end = len(data)
        # When True, '\\' is NOT treated as an escape character outside of
        # quotes (Windows-style path friendliness).
        self.win32Escapes = win32Escapes

    def eat(self):
        """Consume and return the next character."""
        c = self.data[self.pos]
        self.pos += 1
        return c

    def look(self):
        """Return the next character without consuming it."""
        return self.data[self.pos]

    def maybe_eat(self, c):
        """
        maybe_eat(c) - Consume the character c if it is the next character,
        returning True if a character was consumed."""
        if self.data[self.pos] == c:
            self.pos += 1
            return True
        return False

    def lex_arg_fast(self, c):
        """Fast path for lexing an argument.

        Grabs the next whitespace-delimited chunk and accepts it wholesale
        when it contains no characters needing special treatment. Returns
        None when the slow path must be used instead.
        """
        # Get the leading whitespace free section.
        chunk = self.data[self.pos - 1 :].split(None, 1)[0]

        # If it has special characters, the fast path failed.
        if (
            "|" in chunk
            or "&" in chunk
            or "<" in chunk
            or ">" in chunk
            or "'" in chunk
            or '"' in chunk
            or ";" in chunk
            or "\\" in chunk
        ):
            return None

        self.pos = self.pos - 1 + len(chunk)
        return GlobItem(chunk) if "*" in chunk or "?" in chunk else chunk

    def lex_arg_slow(self, c):
        """Slow path for lexing an argument.

        Handles quoting, backslash escapes, glob characters, and
        fd-prefixed redirections such as '2>'.
        """
        # Renamed from 'str' to avoid shadowing the builtin.
        if c in "'\"":
            s = self.lex_arg_quoted(c)
        else:
            s = c
        unquoted_glob_char = False
        quoted_glob_char = False
        while self.pos != self.end:
            c = self.look()
            if c.isspace() or c in "|&;":
                break
            elif c in "><":
                # This is an annoying case; we treat '2>' as a single token so
                # we don't have to track whitespace tokens.

                # If the parse string isn't an integer, do the usual thing.
                if not s.isdigit():
                    break

                # Otherwise, lex the operator and convert to a redirection
                # token.
                num = int(s)
                tok = self.lex_one_token()
                assert isinstance(tok, tuple) and len(tok) == 1
                return (tok[0], num)
            elif c == '"' or c == "'":
                self.eat()
                quoted_arg = self.lex_arg_quoted(c)
                if "*" in quoted_arg or "?" in quoted_arg:
                    quoted_glob_char = True
                s += quoted_arg
            elif not self.win32Escapes and c == "\\":
                # Outside of a string, '\\' escapes everything.
                self.eat()
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data
                    )
                    return s
                s += self.eat()
            elif c in "*?":
                unquoted_glob_char = True
                s += self.eat()
            else:
                s += self.eat()
        # If a quote character is present, lex_arg_quoted will remove the quotes
        # and append the argument directly. This causes a problem when the
        # quoted portion contains a glob character, as the character will no
        # longer be treated literally. If glob characters occur *only* inside
        # of quotes, then we can handle this by not globbing at all, and if
        # glob characters occur *only* outside of quotes, we can still glob just
        # fine. But if a glob character occurs both inside and outside of
        # quotes this presents a problem. In practice this is such an obscure
        # edge case that it doesn't seem worth the added complexity to support.
        # By adding an assertion, it means some bot somewhere will catch this
        # and flag the user of a non-portable test (which could almost certainly
        # be re-written to work correctly without triggering this).
        assert not (quoted_glob_char and unquoted_glob_char)
        return GlobItem(s) if unquoted_glob_char else s

    def lex_arg_quoted(self, delim):
        """Lex the remainder of a quoted argument.

        The opening delimiter has already been consumed; returns the
        contents with the surrounding quotes removed. Emits a warning (and
        returns what was gathered) on unterminated quotes or a trailing
        escape.
        """
        s = ""
        while self.pos != self.end:
            c = self.eat()
            if c == delim:
                return s
            elif c == "\\" and delim == '"':
                # Inside a '"' quoted string, '\\' only escapes the quote
                # character and backslash, otherwise it is preserved.
                if self.pos == self.end:
                    lit.util.warning(
                        "escape at end of quoted argument in: %r" % self.data
                    )
                    return s
                c = self.eat()
                if c == '"':
                    s += '"'
                elif c == "\\":
                    s += "\\"
                else:
                    s += "\\" + c
            else:
                s += c
        lit.util.warning("missing quote character in %r" % self.data)
        return s

    def lex_arg_checked(self, c):
        """Debug helper: run both the fast and slow argument lexers and
        raise ValueError if they disagree on result or final position."""
        pos = self.pos
        res = self.lex_arg_fast(c)
        end = self.pos

        self.pos = pos
        reference = self.lex_arg_slow(c)
        if res is not None:
            if res != reference:
                raise ValueError("Fast path failure: %r != %r" % (res, reference))
            if self.pos != end:
                raise ValueError("Fast path failure: %r != %r" % (self.pos, end))
        return reference

    def lex_arg(self, c):
        """Lex one argument, preferring the fast path."""
        return self.lex_arg_fast(c) or self.lex_arg_slow(c)

    def lex_one_token(self):
        """
        lex_one_token - Lex a single 'sh' token."""

        c = self.eat()
        if c == ";":
            return (c,)
        if c == "|":
            if self.maybe_eat("|"):
                return ("||",)
            return (c,)
        if c == "&":
            if self.maybe_eat("&"):
                return ("&&",)
            if self.maybe_eat(">"):
                return ("&>",)
            return (c,)
        if c == ">":
            if self.maybe_eat("&"):
                return (">&",)
            if self.maybe_eat(">"):
                return (">>",)
            return (c,)
        if c == "<":
            if self.maybe_eat("&"):
                return ("<&",)
            # NOTE(review): this lexes the two-character input '<>' as the
            # ('<<',) token — preserved from the original; confirm intended.
            if self.maybe_eat(">"):
                return ("<<",)
            return (c,)

        return self.lex_arg(c)

    def lex(self):
        """Generate tokens until the input is exhausted, skipping
        whitespace between tokens."""
        while self.pos != self.end:
            if self.look().isspace():
                self.eat()
            else:
                yield self.lex_one_token()


###


class ShParser:
    """Parser over ShLexer's token stream, producing Command, Pipeline,
    and Seq nodes from lit.ShCommands."""

    def __init__(self, data, win32Escapes=False, pipefail=False):
        self.data = data
        self.pipefail = pipefail
        self.tokens = ShLexer(data, win32Escapes=win32Escapes).lex()

    def lex(self):
        """Return the next token, or None when the stream is exhausted."""
        # Idiomatic replacement for the original for-loop-with-return.
        return next(self.tokens, None)

    def look(self):
        """Return the next token without consuming it (None at end)."""
        token = self.lex()
        if token is not None:
            # Push the token back by chaining it in front of the stream.
            self.tokens = itertools.chain([token], self.tokens)
        return token

    def parse_command(self):
        """Parse a single command: a sequence of argument words plus any
        redirections. Raises ValueError on empty input or a syntax error."""
        tok = self.lex()
        if not tok:
            raise ValueError("empty command!")
        if isinstance(tok, tuple):
            raise ValueError("syntax error near unexpected token %r" % tok[0])

        args = [tok]
        redirects = []
        while True:
            tok = self.look()

            # EOF?
            if tok is None:
                break

            # If this is an argument, just add it to the current command.
            if isinstance(tok, (str, GlobItem)):
                args.append(self.lex())
                continue

            # Otherwise see if it is a terminator.
            assert isinstance(tok, tuple)
            if tok[0] in ("|", ";", "&", "||", "&&"):
                break

            # Otherwise it must be a redirection.
            op = self.lex()
            arg = self.lex()
            if not arg:
                raise ValueError("syntax error near token %r" % op[0])
            redirects.append((op, arg))

        return Command(args, redirects)

    def parse_pipeline(self):
        """Parse a pipeline: one or more commands joined by '|'."""
        negate = False

        commands = [self.parse_command()]
        while self.look() == ("|",):
            self.lex()
            commands.append(self.parse_command())
        return Pipeline(commands, negate, self.pipefail)

    def parse(self):
        """Parse the whole input: pipelines joined by sequencing operators
        (';', '&', '||', '&&'), folded left-to-right into Seq nodes."""
        lhs = self.parse_pipeline()

        while self.look():
            operator = self.lex()
            assert isinstance(operator, tuple) and len(operator) == 1

            if not self.look():
                raise ValueError("missing argument to operator %r" % operator[0])

            # FIXME: Operator precedence!!
            lhs = Seq(lhs, operator[0], self.parse_pipeline())

        return lhs