xref: /llvm-project/clang/utils/creduce-clang-crash.py (revision c49770c60f26e449379447109f7d915bd8de0384)
1#!/usr/bin/env python3
2"""Calls C-Reduce to create a minimal reproducer for clang crashes.
Unknown arguments are treated as creduce options.
4
5Output files:
6  *.reduced.sh -- crash reproducer with minimal arguments
7  *.reduced.cpp -- the reduced file
8  *.test.sh -- interestingness test for C-Reduce
9"""
10
11from argparse import ArgumentParser, RawTextHelpFormatter
12import os
13import re
14import shutil
15import stat
16import sys
17import subprocess
18import shlex
19import tempfile
20import shutil
21import multiprocessing
22
# Module-wide settings. main() assigns the real values after parsing the
# command line; the defaults below only apply before that happens.
verbose = False  # When true, verbose_print() forwards its arguments to print().
creduce_cmd = None  # Path of the creduce executable, as returned by check_cmd().
clang_cmd = None  # Path of the clang executable, as returned by check_cmd().
26
27
def verbose_print(*args, **kwargs):
    """Forward all arguments to print(), but only when verbose mode is on."""
    if not verbose:
        return
    print(*args, **kwargs)
31
32
def check_file(fname):
    """Return the normalized form of `fname`, exiting if it is not a file.

    The process is terminated through sys.exit() with an error message when
    the normalized path does not name an existing regular file.
    """
    normalized = os.path.normpath(fname)
    if os.path.isfile(normalized):
        return normalized
    sys.exit("ERROR: %s does not exist" % (normalized))
38
39
def check_cmd(cmd_name, cmd_dir, cmd_path=None):
    """Locate an executable and return its path, or exit with an error.

    When `cmd_path` is given it is made absolute and must itself be an
    executable; `cmd_name` and `cmd_dir` are ignored in that case. Otherwise
    `cmd_name` is looked up in `cmd_dir`, or in the default search path when
    `cmd_dir` is empty or None.
    """
    if not cmd_path:
        located = shutil.which(cmd_name, path=cmd_dir)
        if located:
            return located
        searched = cmd_dir if cmd_dir else "$PATH"
        sys.exit("ERROR: `%s` not found in %s" % (cmd_name, searched))

    # Make the path absolute so the creduce test can be run from any directory.
    absolute = os.path.abspath(cmd_path)
    located = shutil.which(absolute)
    if not located:
        sys.exit("ERROR: executable `%s` not found" % (absolute))
    return located
60
61
def quote_cmd(cmd):
    """Render a list of arguments as a single shell-quoted command line."""
    quoted_args = map(shlex.quote, cmd)
    return " ".join(quoted_args)
64
65
def write_to_script(text, filename):
    """Write `text` to `filename` and mark the file as executable by its owner."""
    with open(filename, "w") as script:
        script.write(text)
    # Add the owner-execute bit on top of whatever mode the file already has.
    current_mode = os.stat(filename).st_mode
    os.chmod(filename, current_mode | stat.S_IEXEC)
70
71
class Reduce:
    """Reduces a clang crash reproducer with the help of C-Reduce.

    Construction copies the input file to `<name>.reduced<ext>`, reads the
    clang arguments from the crash script and runs clang once to learn which
    output identifies the crash. The remaining methods simplify the command,
    write the interestingness test, run C-Reduce and finally minimize the
    command line.

    Attributes:
        testfile: path of the generated interestingness test script.
        crash_script: path of the reduced crash script that is written at the end.
        file_to_reduce: path of the working copy that gets reduced.
        clang: clang executable, taken from the module-level `clang_cmd`.
        clang_args: current list of clang arguments (without clang and the file).
        expected_output: strings that must all appear in the crash output.
        needs_stack_trace: True when the crash is identified by stack frames.
        creduce_flags: flags passed to C-Reduce; always starts with "--tidy".
    """

    # Matches the color escape sequences that are stripped from the output.
    _ANSI_ESCAPE_RE = r"\x1b\[[0-?]*m"

    # Specific error messages, tried in order; the first match identifies the
    # crash.
    _CRASH_MESSAGE_RES = (
        r"Assertion .+ failed",  # Linux assert()
        r"Assertion failed: .+,",  # FreeBSD/Mac assert()
        r"fatal error: error in backend: .+",
        r"LLVM ERROR: .+",
        r"UNREACHABLE executed at .+?!",
        r"LLVM IR generation of declaration '.+'",
        r"Generating code for declaration '.+'",
        r"\*\*\* Bad machine code: .+ \*\*\*",
        r"ERROR: .*Sanitizer: [^ ]+ ",
    )

    # Captures the function name of a stack trace line.
    _STACK_TRACE_RE = r"[0-9]+\s+0[xX][0-9a-fA-F]+\s*([^(]+)\("

    # Stack frames containing any of these names are too common to identify a
    # particular crash and are ignored.
    _COMMON_FRAMES = (
        "PrintStackTrace",
        "RunSignalHandlers",
        "CleanupOnSignal",
        "HandleCrash",
        "SignalHandler",
        "__restore_rt",
        "gsignal",
        "abort",
    )

    def __init__(self, crash_script, file_to_reduce, creduce_flags):
        """Set up the output file names and learn the expected crash output.

        Args:
            crash_script: script whose first command line is the crashing
                clang invocation.
            file_to_reduce: source file that triggers the crash.
            creduce_flags: extra flags for C-Reduce.
        """
        crash_script_name, crash_script_ext = os.path.splitext(crash_script)
        file_reduce_name, file_reduce_ext = os.path.splitext(file_to_reduce)

        self.testfile = file_reduce_name + ".test.sh"
        self.crash_script = crash_script_name + ".reduced" + crash_script_ext
        self.file_to_reduce = file_reduce_name + ".reduced" + file_reduce_ext
        # Work on a copy so that the original input file is left untouched.
        shutil.copy(file_to_reduce, self.file_to_reduce)

        self.clang = clang_cmd
        self.clang_args = []
        self.expected_output = []
        self.needs_stack_trace = False
        self.creduce_flags = ["--tidy"] + creduce_flags

        self.read_clang_args(crash_script, file_to_reduce)
        self.read_expected_output()

    def get_crash_cmd(self, cmd=None, args=None, filename=None):
        """Return the clang invocation as a list: command, arguments, file.

        Args:
            cmd: executable to run; defaults to `self.clang` when empty.
            args: clang arguments; `self.clang_args` is used only when this
                is None, so an explicitly empty list means "no arguments".
            filename: input file; defaults to `self.file_to_reduce` when empty.
        """
        if not cmd:
            cmd = self.clang
        # Compare against None rather than testing truthiness: falling back
        # to the full argument list for an empty one would make every check
        # of a fully stripped command run the unreduced command instead.
        if args is None:
            args = self.clang_args
        if not filename:
            filename = self.file_to_reduce

        return [cmd, *args, filename]

    def _run_crash_cmd(self, args=None, filename=None):
        """Run clang and return its combined stdout/stderr as text.

        Bytes that are not valid UTF-8 are replaced instead of raising, so
        unusual crash output cannot abort the reduction.
        """
        p = subprocess.Popen(
            self.get_crash_cmd(args=args, filename=filename),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        raw_output, _ = p.communicate()
        return raw_output.decode("utf-8", errors="replace")

    @staticmethod
    def _first_command(lines):
        """Return the tokens of the first line that is not blank or a comment.

        Returns an empty list when there is no such line.
        """
        for line in lines:
            stripped = line.strip()
            if stripped and not stripped.startswith("#"):
                return shlex.split(line)
        return []

    def read_clang_args(self, crash_script, filename):
        """Extract the clang arguments from the crash script.

        The clang call is assumed to be the first line that is neither blank
        nor a comment. The executable and the last occurrence of `filename`
        are dropped; the rest is stored in `self.clang_args`. Exits when the
        script contains no command.
        """
        print("\nReading arguments from crash script...")
        with open(crash_script) as f:
            cmd = self._first_command(f)
        if not cmd:
            sys.exit("Could not find command in the crash script.")

        # Remove clang and filename from the command
        # Assume the last occurrence of the filename is the clang input file
        args = cmd[1:]
        for i in reversed(range(len(args))):
            if args[i] == filename:
                del args[i]
                break
        self.clang_args = args
        verbose_print("Clang arguments:", quote_cmd(self.clang_args))

    @classmethod
    def _is_common_frame(cls, func_name):
        """Tell whether a stack frame name contains one of the common names."""
        return any(name in func_name for name in cls._COMMON_FRAMES)

    def read_expected_output(self):
        """Run the crash command and record what identifies the crash.

        A known error message is preferred. Without one, the first five stack
        trace functions that are not common frames are used and
        `needs_stack_trace` is set. Exits with status 1 when neither is found.
        """
        print("\nGetting expected crash output...")
        crash_output = self._run_crash_cmd()

        # Remove color codes
        crash_output = re.sub(self._ANSI_ESCAPE_RE, "", crash_output)

        # Look for specific error messages
        result = []
        for msg_re in self._CRASH_MESSAGE_RES:
            match = re.search(msg_re, crash_output)
            if match:
                msg = match.group(0)
                result = [msg]
                print("Found message:", msg)
                break

        # If no message was found, use the top five stack trace functions,
        # ignoring some common functions
        # Five is a somewhat arbitrary number; the goal is to get a small number
        # of identifying functions with some leeway for common functions
        if not result:
            self.needs_stack_trace = True
            matches = re.findall(self._STACK_TRACE_RE, crash_output)
            result = [x for x in matches if x and not self._is_common_frame(x)][:5]
            for msg in result:
                print("Found stack trace function:", msg)

        if not result:
            print("ERROR: no crash was found")
            print("The crash output was:\n========\n%s========" % crash_output)
            sys.exit(1)

        self.expected_output = result

    def check_expected_output(self, args=None, filename=None):
        """Return True if clang's output contains every expected string.

        Args:
            args: clang arguments to try; None selects `self.clang_args`,
                while an empty list really runs clang without arguments.
            filename: input file to try; defaults to `self.file_to_reduce`.
        """
        crash_output = self._run_crash_cmd(args=args, filename=filename)
        return all(msg in crash_output for msg in self.expected_output)

    def write_interestingness_test(self):
        """Write the interestingness test script and sanity-check it.

        The script fails when clang succeeds or when any expected string is
        missing from clang's output, which it captures in `t.log`.
        """
        print("\nCreating the interestingness test...")

        # Disable symbolization if it's not required to avoid slow symbolization.
        disable_symbolization = ""
        if not self.needs_stack_trace:
            disable_symbolization = "export LLVM_DISABLE_SYMBOLIZATION=1"

        output = """#!/bin/bash
%s
if %s >& t.log ; then
  exit 1
fi
""" % (
            disable_symbolization,
            quote_cmd(self.get_crash_cmd()),
        )

        for msg in self.expected_output:
            output += "grep -F %s t.log || exit 1\n" % shlex.quote(msg)

        write_to_script(output, self.testfile)
        self.check_interestingness()

    def check_interestingness(self):
        """Verify the test accepts the original file and rejects an empty one.

        Exits with an error message if either check fails.
        """
        testfile = os.path.abspath(self.testfile)

        # Check that the test considers the original file interesting
        returncode = subprocess.call(testfile, stdout=subprocess.DEVNULL)
        if returncode:
            sys.exit("The interestingness test does not pass for the original file.")

        # Check that an empty file is not interesting
        # Instead of modifying the filename in the test file, just run the command
        with tempfile.NamedTemporaryFile() as empty_file:
            is_interesting = self.check_expected_output(filename=empty_file.name)
        if is_interesting:
            sys.exit("The interestingness test passes for an empty file.")

    def clang_preprocess(self):
        """Replace the working file by its preprocessed form if it still crashes.

        Preprocessing without line markers (-P) is tried first, then with
        line markers. The working file is kept as is when preprocessing fails
        or when the preprocessed output no longer reproduces the crash.
        """
        print("\nTrying to preprocess the source file...")
        with tempfile.NamedTemporaryFile() as tmpfile:
            cmd_preprocess = self.get_crash_cmd() + ["-E", "-o", tmpfile.name]
            cmd_preprocess_no_lines = cmd_preprocess + ["-P"]
            try:
                subprocess.check_call(cmd_preprocess_no_lines)
                if self.check_expected_output(filename=tmpfile.name):
                    print("Successfully preprocessed with line markers removed")
                    shutil.copy(tmpfile.name, self.file_to_reduce)
                else:
                    subprocess.check_call(cmd_preprocess)
                    if self.check_expected_output(filename=tmpfile.name):
                        print("Successfully preprocessed without removing line markers")
                        shutil.copy(tmpfile.name, self.file_to_reduce)
                    else:
                        print(
                            "No longer crashes after preprocessing -- "
                            "using original source"
                        )
            except subprocess.CalledProcessError:
                print("Preprocessing failed")

    @staticmethod
    def filter_args(
        args, opts_equal=(), opts_startswith=(), opts_one_arg_startswith=()
    ):
        """Return a copy of `args` without the arguments selected by the options.

        Args:
            args: list of arguments to filter; it is not modified.
            opts_equal: arguments equal to one of these are dropped.
            opts_startswith: arguments starting with one of these are dropped.
            opts_one_arg_startswith: arguments starting with one of these are
                dropped together with the argument that follows them.
        """
        result = []
        skip_next = False
        for arg in args:
            if skip_next:
                skip_next = False
                continue
            if any(arg == a for a in opts_equal):
                continue
            if any(arg.startswith(a) for a in opts_startswith):
                continue
            if any(arg.startswith(a) for a in opts_one_arg_startswith):
                skip_next = True
                continue
            result.append(arg)
        return result

    def try_remove_args(self, args, msg=None, extra_arg=None, **kwargs):
        """Filter `args` and keep the result only if the crash still reproduces.

        Args:
            args: current list of clang arguments.
            msg: message printed in verbose mode when the change is kept.
            extra_arg: argument moved (or added) to the end of the new list.
            **kwargs: selection options forwarded to filter_args().

        Returns:
            The new list when it differs from `args` and still produces the
            expected output, otherwise `args` itself.
        """
        new_args = self.filter_args(args, **kwargs)

        if extra_arg:
            if extra_arg in new_args:
                new_args.remove(extra_arg)
            new_args.append(extra_arg)

        if new_args != args and self.check_expected_output(args=new_args):
            if msg:
                verbose_print(msg)
            return new_args
        return args

    def try_remove_arg_by_index(self, args, index):
        """Try to drop the argument at `index`, plus its value if it has one.

        Returns:
            `(new_args, index)` when the crash still reproduces without the
            argument, so that the same position is examined again; otherwise
            `(args, index + 1)`.
        """
        new_args = args[:index] + args[index + 1 :]
        removed_arg = args[index]

        # Heuristic for grouping arguments:
        # remove next argument if it doesn't start with "-"
        if index < len(new_args) and not new_args[index].startswith("-"):
            del new_args[index]
            removed_arg += " " + args[index + 1]

        if self.check_expected_output(args=new_args):
            verbose_print("Removed", removed_arg)
            return new_args, index
        return args, index + 1

    def simplify_clang_args(self):
        """Simplify clang arguments before running C-Reduce to reduce the time the
        interestingness test takes to run.
        """
        print("\nSimplifying the clang command...")
        new_args = self.clang_args

        # Remove the color diagnostics flag to make it easier to match error
        # text.
        new_args = self.try_remove_args(
            new_args,
            msg="Removed -fcolor-diagnostics",
            opts_equal=["-fcolor-diagnostics"],
        )

        # Remove some clang arguments to speed up the interestingness test
        new_args = self.try_remove_args(
            new_args,
            msg="Removed debug info options",
            opts_startswith=["-gcodeview", "-debug-info-kind=", "-debugger-tuning="],
        )

        new_args = self.try_remove_args(
            new_args, msg="Removed --show-includes", opts_startswith=["--show-includes"]
        )
        # Not suppressing warnings (-w) sometimes prevents the crash from occurring
        # after preprocessing
        new_args = self.try_remove_args(
            new_args,
            msg="Replaced -W options with -w",
            extra_arg="-w",
            opts_startswith=["-W"],
        )
        new_args = self.try_remove_args(
            new_args,
            msg="Replaced optimization level with -O0",
            extra_arg="-O0",
            opts_startswith=["-O"],
        )

        # Try to remove compilation steps
        new_args = self.try_remove_args(
            new_args, msg="Added -emit-llvm", extra_arg="-emit-llvm"
        )
        new_args = self.try_remove_args(
            new_args, msg="Added -fsyntax-only", extra_arg="-fsyntax-only"
        )

        # Try to make implicit int an error for more sensible test output
        new_args = self.try_remove_args(
            new_args,
            msg="Added -Werror=implicit-int",
            opts_equal=["-w"],
            extra_arg="-Werror=implicit-int",
        )

        self.clang_args = new_args
        verbose_print("Simplified command:", quote_cmd(self.get_crash_cmd()))

    def reduce_clang_args(self):
        """Minimize the clang arguments after running C-Reduce, to get the smallest
        command that reproduces the crash on the reduced file.

        The resulting command is written to `self.crash_script`.
        """
        print("\nReducing the clang crash command...")

        new_args = self.clang_args

        # Remove some often occurring args
        new_args = self.try_remove_args(
            new_args, msg="Removed -D options", opts_startswith=["-D"]
        )
        new_args = self.try_remove_args(
            new_args, msg="Removed -D options", opts_one_arg_startswith=["-D"]
        )
        new_args = self.try_remove_args(
            new_args, msg="Removed -I options", opts_startswith=["-I"]
        )
        new_args = self.try_remove_args(
            new_args, msg="Removed -I options", opts_one_arg_startswith=["-I"]
        )
        new_args = self.try_remove_args(
            new_args, msg="Removed -W options", opts_startswith=["-W"]
        )

        # Remove other cases that aren't covered by the heuristic
        new_args = self.try_remove_args(
            new_args, msg="Removed -mllvm", opts_one_arg_startswith=["-mllvm"]
        )

        # Try every remaining argument one by one. The helper returns the
        # index to look at next, which stays put after a successful removal.
        i = 0
        while i < len(new_args):
            new_args, i = self.try_remove_arg_by_index(new_args, i)

        self.clang_args = new_args

        reduced_cmd = quote_cmd(self.get_crash_cmd())
        write_to_script(reduced_cmd, self.crash_script)
        print("Reduced command:", reduced_cmd)

    def run_creduce(self):
        """Run C-Reduce on the working file and wait for it to finish.

        A ctrl-c stops C-Reduce but not this script, so the steps that
        follow still run on whatever has been reduced so far.
        """
        full_creduce_cmd = (
            [creduce_cmd] + self.creduce_flags + [self.testfile, self.file_to_reduce]
        )
        print("\nRunning C-Reduce...")
        verbose_print(quote_cmd(full_creduce_cmd))
        # Start the process outside the try block so that the handler below
        # can never refer to a process object that was not created.
        p = subprocess.Popen(full_creduce_cmd)
        try:
            p.communicate()
        except KeyboardInterrupt:
            # Hack to kill C-Reduce because it jumps into its own pgid
            print("\n\nctrl-c detected, killed creduce")
            p.kill()
            # Reap the killed process so it does not linger as a zombie.
            p.wait()
425
426
def main():
    """Parse the command line and run the whole reduction pipeline.

    Options that the parser does not know are passed on to C-Reduce.
    """
    global verbose
    global creduce_cmd
    global clang_cmd

    arg_parser = ArgumentParser(
        description=__doc__, formatter_class=RawTextHelpFormatter
    )
    add = arg_parser.add_argument
    add(
        "crash_script",
        type=str,
        nargs=1,
        help="Name of the script that generates the crash.",
    )
    add("file_to_reduce", type=str, nargs=1, help="Name of the file to be reduced.")
    add("--llvm-bin", dest="llvm_bin", type=str, help="Path to the LLVM bin directory.")
    add(
        "--clang",
        dest="clang",
        type=str,
        help="The path to the `clang` executable. "
        "By default uses the llvm-bin directory.",
    )
    add(
        "--creduce",
        dest="creduce",
        type=str,
        help="The path to the `creduce` executable. "
        "Required if `creduce` is not in PATH environment.",
    )
    add("-v", "--verbose", action="store_true")

    options, extra_flags = arg_parser.parse_known_args()
    verbose = options.verbose

    # Resolve the tools first, so that a missing executable is reported before
    # the input files are looked at.
    if options.llvm_bin:
        bin_dir = os.path.abspath(options.llvm_bin)
    else:
        bin_dir = None
    creduce_cmd = check_cmd("creduce", None, options.creduce)
    clang_cmd = check_cmd("clang", bin_dir, options.clang)

    # nargs=1 makes each positional argument a one-element list.
    script_path = check_file(options.crash_script[0])
    source_path = check_file(options.file_to_reduce[0])

    # Pick a default degree of parallelism unless the user passed one.
    if "--n" not in extra_flags:
        jobs = max(4, multiprocessing.cpu_count() // 2)
        extra_flags.extend(["--n", str(jobs)])

    reducer = Reduce(script_path, source_path, extra_flags)

    reducer.simplify_clang_args()
    reducer.write_interestingness_test()
    reducer.clang_preprocess()
    reducer.run_creduce()
    reducer.reduce_clang_args()
479
480
# Run the reduction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
483