xref: /openbsd-src/gnu/llvm/clang/tools/scan-build-py/lib/libscanbuild/intercept.py (revision a9ac8606c53d55cee9c3a39778b249c51df111ef)
1*a9ac8606Spatrick# -*- coding: utf-8 -*-
2*a9ac8606Spatrick# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3*a9ac8606Spatrick# See https://llvm.org/LICENSE.txt for license information.
4*a9ac8606Spatrick# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.
7*a9ac8606Spatrick
8*a9ac8606SpatrickThis implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
9*a9ac8606Spatrickmechanisms provided by the dynamic linker. The related library is implemented
10*a9ac8606Spatrickin C language and can be found under 'libear' directory.
11*a9ac8606Spatrick
12*a9ac8606SpatrickThe 'libear' library is capturing all child process creation and logging the
13*a9ac8606Spatrickrelevant information about it into separate files in a specified directory.
14*a9ac8606SpatrickThe parameter of this process is the output directory name, where the report
15*a9ac8606Spatrickfiles shall be placed. This parameter is passed as an environment variable.
16*a9ac8606Spatrick
17*a9ac8606SpatrickThe module also implements compiler wrappers to intercept the compiler calls.
18*a9ac8606Spatrick
The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
21*a9ac8606Spatrick
22*a9ac8606Spatrickimport sys
23*a9ac8606Spatrickimport os
24*a9ac8606Spatrickimport os.path
25*a9ac8606Spatrickimport re
26*a9ac8606Spatrickimport itertools
27*a9ac8606Spatrickimport json
28*a9ac8606Spatrickimport glob
29*a9ac8606Spatrickimport logging
30*a9ac8606Spatrickfrom libear import build_libear, TemporaryDirectory
31*a9ac8606Spatrickfrom libscanbuild import command_entry_point, compiler_wrapper, \
32*a9ac8606Spatrick    wrapper_environment, run_command, run_build
33*a9ac8606Spatrickfrom libscanbuild import duplicate_check
34*a9ac8606Spatrickfrom libscanbuild.compilation import split_command
35*a9ac8606Spatrickfrom libscanbuild.arguments import parse_args_for_intercept_build
36*a9ac8606Spatrickfrom libscanbuild.shell import encode, decode
37*a9ac8606Spatrick
# Names exported by a star-import of this module.
__all__ = ['capture', 'intercept_build', 'intercept_compiler_wrapper']
39*a9ac8606Spatrick
# ASCII separator characters used by the execution trace file format.
GS = '\x1d'  # group separator: closes one execution report
RS = '\x1e'  # record separator: stands between the fields of a report
US = '\x1f'  # unit separator: follows every argument of the command

# Values put into the CC and CXX variables when compiler wrappers are used.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
# Extension of the trace file names (same as in ear.c).
TRACE_FILE_EXTENSION = '.cmd'
# Platforms for which `is_preload_disabled` always answers True.
WRAPPER_ONLY_PLATFORMS = frozenset(['win32', 'cygwin'])
48*a9ac8606Spatrick
49*a9ac8606Spatrick
@command_entry_point
def intercept_build():
    """ Run the 'intercept-build' command.

    Parses the command line with `parse_args_for_intercept_build` and hands
    the result over to `capture`.

    :return: whatever `capture` returns for the parsed arguments. """

    return capture(parse_args_for_intercept_build())
56*a9ac8606Spatrick
57*a9ac8606Spatrick
def capture(args):
    """ Run the build command and write the compilation database.

    The build is executed with the environment prepared by
    `setup_environment`. The trace files found in the temporary directory
    afterwards are parsed, converted into compilation database entries and
    written to `args.cdb` as JSON.

    :param args:    the parsed command line arguments; this function reads
                    `args.build`, `args.cdb` and the optional `args.append`,
    :return:        the value returned by `run_build` for the build. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands:    iterable of the parsed exec traces,
        :return:            generator of the unique entries (previous ones
                            first) whose source file exists. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            format_entry(command) for command in commands)
        # read entries from previous run
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = json.load(handle)
        else:
            previous = []
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-') as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # Collect the trace files. The glob results already contain the
        # directory part, so they must not be joined with it again: that
        # would point to a wrong place when the directory name is relative.
        trace_files = sorted(
            glob.iglob(os.path.join(tmp_dir, '*' + TRACE_FILE_EXTENSION)))
        # read the intercepted exec calls
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(trace_file) for trace_file in trace_files)
        # Do the post processing before the output file is opened. Opening
        # it truncates the file, so a failure while the entries are created
        # would otherwise destroy an already existing database.
        entries = list(post_processing(exec_traces))
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(entries, handle, sort_keys=True, indent=4)
        return exit_code
100*a9ac8606Spatrick
101*a9ac8606Spatrick
def setup_environment(args, destination):
    """ Create the environment for the build command.

    The result is a copy of the current process environment extended with
    the variables needed by the interception: either the ones which make
    the dynamic linker preload the 'libear' library, or the ones which
    route the compiler calls through the wrapper programs.

    :param args:        the parsed command line arguments,
    :param destination: directory name exported to the build as
                        INTERCEPT_BUILD_TARGET_DIR (also passed to
                        `build_libear`),
    :return:            the environment as a dictionary. """

    c_compiler = args.cc if 'cc' in args else 'cc'
    # NOTE(review): this value is computed but never used in this function.
    cxx_compiler = args.cxx if 'cxx' in args else 'c++'

    # the library is only built when it is going to be preloaded
    use_wrappers = \
        args.override_compiler or is_preload_disabled(sys.platform)
    if use_wrappers:
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update(wrapper_environment(args))
        environment['CC'] = COMPILER_WRAPPER_CC
        environment['CXX'] = COMPILER_WRAPPER_CXX
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment['DYLD_INSERT_LIBRARIES'] = libear_path
        environment['DYLD_FORCE_FLAT_NAMESPACE'] = '1'
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment['LD_PRELOAD'] = libear_path

    return environment
136*a9ac8606Spatrick
137*a9ac8606Spatrick
@command_entry_point
def intercept_compiler_wrapper():
    """ Run the `intercept-cc` or the `intercept-c++` command.

    :return: the result of `compiler_wrapper` called with
             `intercept_compiler_wrapper_impl` as its argument. """

    result = compiler_wrapper(intercept_compiler_wrapper_impl)
    return result
143*a9ac8606Spatrick
144*a9ac8606Spatrick
def intercept_compiler_wrapper_impl(_, execution):
    """ Record the current execution into the target directory.

    The directory name is read from the INTERCEPT_BUILD_TARGET_DIR
    environment variable. The report goes into a file of that directory
    which is named after the id of the current process. A missing directory
    name or an IO failure is only logged as a warning.

    :param _:           ignored,
    :param execution:   the object handed over to `write_exec_trace`,
    :return:            None. """

    warning_format = 'execution report might be incomplete: %s'

    report_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
    if report_dir:
        # the report file is named after the current process id
        try:
            report_name = str(os.getpid()) + TRACE_FILE_EXTENSION
            report_file = os.path.join(report_dir, report_name)
            logging.debug('writing execution report to: %s', report_file)
            write_exec_trace(report_file, execution)
        except IOError:
            logging.warning(warning_format, 'io problem')
    else:
        logging.warning(warning_format, 'missing target directory')
165*a9ac8606Spatrick
166*a9ac8606Spatrick
def write_exec_trace(filename, entry):
    """ Append an execution report to the given trace file.

    This method shall be kept in sync with the execution report writer of
    the interception library. A report is a list of fields separated by RS
    and closed by GS. The fields are: the process id, the process id again
    (in the place `parse_exec_trace` reads as the parent process id), the
    word 'wrapper', the working directory and the command. Every argument
    of the command is followed by US. The report is written UTF-8 encoded.

    :param filename:    path to the output execution trace file,
    :param entry:       the Execution object to append to that file; its
                        `pid`, `cwd` and `cmd` attributes are used. """

    with open(filename, 'ab') as handle:
        process_id = str(entry.pid)
        fields = [
            process_id,
            process_id,
            'wrapper',
            entry.cwd,
            US.join(entry.cmd) + US
        ]
        report = RS.join(fields) + GS
        handle.write(report.encode('utf-8'))
181*a9ac8606Spatrick
182*a9ac8606Spatrick
def parse_exec_trace(filename):
    """ Read the execution reports from a trace file.

    The file is written by the 'libear' preloaded library or by the wrapper
    command, and it _might_ hold more than one report. The reports are
    closed by GS, the fields of a report are separated by RS, and every
    argument of the command is followed by US.

    :param filename:    path to the trace file,
    :return:            generator of dictionaries with the 'pid', 'ppid',
                        'function', 'directory' and 'command' keys, where
                        the command is a list of arguments. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handle:
        content = handle.read()
        for report in content.split(GS):
            # the text after the last GS is empty
            if not report:
                continue
            fields = report.split(RS)
            yield {
                'pid': fields[0],
                'ppid': fields[1],
                'function': fields[2],
                'directory': fields[3],
                # drop the empty piece which comes after the last US
                'command': fields[4].split(US)[:-1]
            }
202*a9ac8606Spatrick
203*a9ac8606Spatrick
def format_entry(exec_trace):
    """ Create the compilation database entries of an execution.

    The command is analysed by `split_command`. When the result of that is
    empty, no entry is produced. Otherwise one entry is produced for each
    source file of the compilation.

    :param exec_trace:  dictionary with the 'command' and 'directory' keys,
    :return:            generator of dictionaries with the 'directory',
                        'command' and 'file' keys. """

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        return

    for source in compilation.files:
        # the compiler name is reduced to either 'c++' or 'cc'
        if compilation.compiler == 'c++':
            compiler = 'c++'
        else:
            compiler = 'cc'
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)
        directory = exec_trace['directory']
        shell_command = encode(command)
        # make a normalized absolute path from the source file name
        working_dir = exec_trace['directory']
        if os.path.isabs(source):
            source_path = source
        else:
            source_path = os.path.join(working_dir, source)
        yield {
            'directory': directory,
            'command': shell_command,
            'file': os.path.normpath(source_path)
        }
224*a9ac8606Spatrick
225*a9ac8606Spatrick
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    The platforms listed in WRAPPER_ONLY_PLATFORMS are always reported as
    disabled. When the 'csrutil' query fails on Darwin, the preload is
    reported as not disabled.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker. """

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    if platform != 'darwin':
        return False

    command = ['csrutil', 'status']
    pattern = re.compile(r'System Integrity Protection status:\s+enabled')
    try:
        # presumably `run_command` gives the output lines of the command
        return any(pattern.match(line) for line in run_command(command))
    except Exception:
        # Best effort: a failing query is taken as "not disabled". Only
        # Exception is caught here, so KeyboardInterrupt and SystemExit
        # are not swallowed as they were by the former bare except.
        logging.debug('could not query the SIP status', exc_info=True)
        return False
247*a9ac8606Spatrick
248*a9ac8606Spatrick
def entry_hash(entry):
    """ Create the key which identifies a compilation database entry.

    The key is made of the reversed file name, the reversed directory name
    and the arguments of the command, joined by '<>'.

    :param entry:   dictionary with the 'file', 'directory' and 'command'
                    keys,
    :return:        the key as a string. """

    # The names are reversed; the stated intent is faster lookup in set.
    reversed_file = entry['file'][::-1]
    reversed_directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. To avoid this, the first
    # word of the command is left out of the key.
    arguments = decode(entry['command'])[1:]

    parts = [reversed_file, reversed_directory, ' '.join(arguments)]
    return '<>'.join(parts)
263