xref: /llvm-project/clang/tools/scan-build-py/lib/libscanbuild/intercept.py (revision dd3c26a045c081620375a878159f536758baba6e)
1d9cf8291SDaniel Hwang# -*- coding: utf-8 -*-
2d9cf8291SDaniel Hwang# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3d9cf8291SDaniel Hwang# See https://llvm.org/LICENSE.txt for license information.
4d9cf8291SDaniel Hwang# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.
7d9cf8291SDaniel Hwang
8d9cf8291SDaniel HwangThis implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
9d9cf8291SDaniel Hwangmechanisms provided by the dynamic linker. The related library is implemented
10d9cf8291SDaniel Hwangin C language and can be found under 'libear' directory.
11d9cf8291SDaniel Hwang
12d9cf8291SDaniel HwangThe 'libear' library is capturing all child process creation and logging the
13d9cf8291SDaniel Hwangrelevant information about it into separate files in a specified directory.
14d9cf8291SDaniel HwangThe parameter of this process is the output directory name, where the report
15d9cf8291SDaniel Hwangfiles shall be placed. This parameter is passed as an environment variable.
16d9cf8291SDaniel Hwang
17d9cf8291SDaniel HwangThe module also implements compiler wrappers to intercept the compiler calls.
18d9cf8291SDaniel Hwang
The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
21d9cf8291SDaniel Hwang
22d9cf8291SDaniel Hwangimport sys
23d9cf8291SDaniel Hwangimport os
24d9cf8291SDaniel Hwangimport os.path
25d9cf8291SDaniel Hwangimport re
26d9cf8291SDaniel Hwangimport itertools
27d9cf8291SDaniel Hwangimport json
28d9cf8291SDaniel Hwangimport glob
29d9cf8291SDaniel Hwangimport logging
30d9cf8291SDaniel Hwangfrom libear import build_libear, TemporaryDirectory
31*dd3c26a0STobias Hietafrom libscanbuild import (
32*dd3c26a0STobias Hieta    command_entry_point,
33*dd3c26a0STobias Hieta    compiler_wrapper,
34*dd3c26a0STobias Hieta    wrapper_environment,
35*dd3c26a0STobias Hieta    run_command,
36*dd3c26a0STobias Hieta    run_build,
37*dd3c26a0STobias Hieta)
38d9cf8291SDaniel Hwangfrom libscanbuild import duplicate_check
39d9cf8291SDaniel Hwangfrom libscanbuild.compilation import split_command
40d9cf8291SDaniel Hwangfrom libscanbuild.arguments import parse_args_for_intercept_build
41d9cf8291SDaniel Hwangfrom libscanbuild.shell import encode, decode
42d9cf8291SDaniel Hwang
43*dd3c26a0STobias Hieta__all__ = ["capture", "intercept_build", "intercept_compiler_wrapper"]
44d9cf8291SDaniel Hwang
# ASCII control characters used as delimiters inside the execution report
# files (see `write_exec_trace` and `parse_exec_trace`).
GS = "\x1d"  # group separator: terminates one execution report
RS = "\x1e"  # record separator: sits between the fields of a report
US = "\x1f"  # unit separator: follows every argument of the command

# Names exported as CC and CXX when the compiler wrappers are in use.
COMPILER_WRAPPER_CC = "intercept-cc"
COMPILER_WRAPPER_CXX = "intercept-c++"
# Suffix of the execution report files (same as in ear.c).
TRACE_FILE_EXTENSION = ".cmd"
# Platforms for which `is_preload_disabled` reports True unconditionally.
WRAPPER_ONLY_PLATFORMS = frozenset(("win32", "cygwin"))
53d9cf8291SDaniel Hwang
54d9cf8291SDaniel Hwang
@command_entry_point
def intercept_build():
    """Run the 'intercept-build' command.

    The command line is parsed by `parse_args_for_intercept_build` and the
    outcome is handed to `capture`; whatever `capture` returns is returned
    from here unchanged."""

    parsed_arguments = parse_args_for_intercept_build()
    exit_status = capture(parsed_arguments)
    return exit_status
61d9cf8291SDaniel Hwang
62d9cf8291SDaniel Hwang
def capture(args):
    """Run the build with interception and write the compilation database.

    The build command is executed with the environment prepared by
    `setup_environment`. The execution reports found in the temporary
    directory afterwards are converted into compilation database entries
    and dumped as JSON into `args.cdb`.

    :param args:    parsed command line arguments; `args.build`, `args.cdb`
                    and the optional `args.append` are read here,
    :return:        the value `run_build` returned for the build command."""

    def post_processing(commands):
        """To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands:    iterable of parsed execution reports,
        :return:            lazy iterable of the entries to be written."""

        # create entries from the current run (evaluated lazily)
        current = itertools.chain.from_iterable(
            format_entry(command) for command in commands
        )
        # read entries from previous run; this has to happen before the
        # output file is opened for writing, because that truncates it
        if "append" in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both, and entries whose source
        # file does not exist
        duplicate = duplicate_check(entry_hash)
        return (
            entry
            for entry in itertools.chain(previous, current)
            if os.path.exists(entry["file"]) and not duplicate(entry)
        )

    with TemporaryDirectory(prefix="intercept-") as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # read the intercepted exec calls; the glob already yields paths
        # which include the directory, so those are used as they are, and
        # the suffix comes from the constant shared with the report writer
        trace_pattern = os.path.join(tmp_dir, "*" + TRACE_FILE_EXTENSION)
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(filename)
            for filename in sorted(glob.iglob(trace_pattern))
        )
        # do post processing
        entries = post_processing(exec_traces)
        # dump the compilation database; the entries are materialized here,
        # while the report files in the temporary directory still exist
        with open(args.cdb, "w+") as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
110d9cf8291SDaniel Hwang
111d9cf8291SDaniel Hwang
def setup_environment(args, destination):
    """Sets up the environment for the build command.

    It copies the current process environment and adds the variables which
    make the exec calls observable. The exec calls will be logged either by
    the 'libear' preloaded library or by the 'wrapper' programs. The build
    command itself is not executed here.

    :param args:        parsed command line arguments; `args.override_compiler`
                        and the optional `args.cc` are read here, and the
                        object is also passed to `wrapper_environment`,
    :param destination: directory where the execution reports shall be placed,
    :return:            dictionary to be used as the build environment."""

    c_compiler = args.cc if "cc" in args else "cc"

    # the library is built only when preloading is both wanted and possible
    use_wrappers = args.override_compiler or is_preload_disabled(sys.platform)
    libear_path = None if use_wrappers else build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment.update({"INTERCEPT_BUILD_TARGET_DIR": destination})

    if not libear_path:
        logging.debug("intercept gonna use compiler wrappers")
        environment.update(wrapper_environment(args))
        environment.update({"CC": COMPILER_WRAPPER_CC, "CXX": COMPILER_WRAPPER_CXX})
    elif sys.platform == "darwin":
        logging.debug("intercept gonna preload libear on OSX")
        environment.update(
            {"DYLD_INSERT_LIBRARIES": libear_path, "DYLD_FORCE_FLAT_NAMESPACE": "1"}
        )
    else:
        logging.debug("intercept gonna preload libear on UNIX")
        environment.update({"LD_PRELOAD": libear_path})

    return environment
145d9cf8291SDaniel Hwang
146d9cf8291SDaniel Hwang
@command_entry_point
def intercept_compiler_wrapper():
    """Run the `intercept-cc` and `intercept-c++` commands.

    Delegates to `compiler_wrapper`, passing it the function which writes
    the execution report."""

    implementation = intercept_compiler_wrapper_impl
    return compiler_wrapper(implementation)
152d9cf8291SDaniel Hwang
153d9cf8291SDaniel Hwang
def intercept_compiler_wrapper_impl(_, execution):
    """Write the execution report of a compiler wrapper call.

    The report goes into a file named after the current process id, inside
    the directory given by the INTERCEPT_BUILD_TARGET_DIR environment
    variable. A missing variable or an IOError during the write only
    produces a warning.

    :param _:           ignored,
    :param execution:   the object passed on to `write_exec_trace`."""

    warning_format = "execution report might be incomplete: %s"

    report_dir = os.getenv("INTERCEPT_BUILD_TARGET_DIR")
    if report_dir:
        # write current execution info to the pid file
        try:
            report_name = "{0}{1}".format(os.getpid(), TRACE_FILE_EXTENSION)
            report_path = os.path.join(report_dir, report_name)
            logging.debug("writing execution report to: %s", report_path)
            write_exec_trace(report_path, execution)
        except IOError:
            logging.warning(warning_format, "io problem")
    else:
        logging.warning(warning_format, "missing target directory")
174d9cf8291SDaniel Hwang
175d9cf8291SDaniel Hwang
def write_exec_trace(filename, entry):
    """Append one execution report to the given file.

    The layout has to stay in sync with the report writer of the
    interception library: the fields are joined by RS, every command
    argument is followed by US, and the report is closed by GS. The process
    id is written into both the first and the second field, and the third
    field is the literal 'wrapper'.

    :param filename:    path to the output execution trace file,
    :param entry:       the Execution object to append to that file; its
                        `pid`, `cmd` and `cwd` attributes are read."""

    with open(filename, "ab") as report:
        process_id = str(entry.pid)
        arguments = US.join(entry.cmd) + US
        fields = (process_id, process_id, "wrapper", entry.cwd, arguments)
        record = RS.join(fields) + GS
        report.write(record.encode("utf-8"))
190d9cf8291SDaniel Hwang
191d9cf8291SDaniel Hwang
def parse_exec_trace(filename):
    """Read back the execution reports stored in a single file.

    The file is written either by the 'libear' preloaded library or by the
    wrapper command, and it _might_ hold more than one report. Reports are
    separated by GS, the fields of a report by RS, and every argument of
    the command is followed by US.

    :param filename:    path to the execution trace file,
    :return:            generator of dictionaries with the keys 'pid',
                        'ppid', 'function', 'directory' and 'command'."""

    logging.debug("parse exec trace file: %s", filename)
    with open(filename, "r") as trace_file:
        text = trace_file.read()
        for chunk in text.split(GS):
            # splitting leaves empty pieces behind (e.g. after the last GS)
            if not chunk:
                continue
            fields = chunk.split(RS)
            # the command ends with US, so the last piece is dropped
            arguments = fields[4].split(US)[:-1]
            yield {
                "pid": fields[0],
                "ppid": fields[1],
                "function": fields[2],
                "directory": fields[3],
                "command": arguments,
            }
211d9cf8291SDaniel Hwang
212d9cf8291SDaniel Hwang
def format_entry(exec_trace):
    """Turn one execution report into compilation database entries.

    Yields one entry per source file found by `split_command`, and nothing
    when `split_command` returns a false value for the command.

    :param exec_trace:  dictionary with the 'command' and 'directory' keys,
    :return:            generator of dictionaries with the keys 'directory',
                        'command' and 'file'."""

    def resolve(base, path):
        """Create normalized absolute path from input filename."""
        if not os.path.isabs(path):
            path = os.path.join(base, path)
        return os.path.normpath(path)

    arguments = exec_trace["command"]
    logging.debug("format this command: %s", arguments)
    compilation = split_command(arguments)
    if not compilation:
        return

    for source in compilation.files:
        # the compiler name is normalized to either 'c++' or 'cc'
        driver = "cc"
        if compilation.compiler == "c++":
            driver = "c++"
        command = [driver, "-c"] + compilation.flags + [source]
        logging.debug("formated as: %s", command)
        working_dir = exec_trace["directory"]
        yield {
            "directory": working_dir,
            "command": encode(command),
            "file": resolve(working_dir, source),
        }
233d9cf8291SDaniel Hwang
234d9cf8291SDaniel Hwang
def is_preload_disabled(platform):
    """Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Platforms listed in WRAPPER_ONLY_PLATFORMS are always reported as
    disabled. Any other non-Darwin platform is reported as enabled.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker."""

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    if platform != "darwin":
        return False

    command = ["csrutil", "status"]
    pattern = re.compile(r"System Integrity Protection status:\s+enabled")
    try:
        # run_command is iterated as the output lines of the command
        return any(pattern.match(line) for line in run_command(command))
    except Exception:
        # Best effort: when the status can not be queried, preload is
        # assumed to work. Only Exception is caught, so KeyboardInterrupt
        # and SystemExit are not swallowed.
        logging.debug("could not query the SIP status", exc_info=True)
        return False
256d9cf8291SDaniel Hwang
257d9cf8291SDaniel Hwang
def entry_hash(entry):
    """Compute the key which identifies a compilation database entry.

    :param entry:   dictionary with the 'file', 'directory' and 'command'
                    keys,
    :return:        string built from the reversed file name, the reversed
                    directory and the command without its first word."""

    # File name and directory are stored reversed; the stated intent is a
    # faster lookup when these keys are kept in a set.
    reversed_file = entry["file"][::-1]
    reversed_directory = entry["directory"][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. Leaving the first word of the
    # command out of the key makes those two calls collide.
    words = decode(entry["command"])
    arguments = " ".join(words[1:])

    return "<>".join((reversed_file, reversed_directory, arguments))
272