xref: /netbsd-src/external/apache2/llvm/dist/clang/tools/scan-build-py/libscanbuild/intercept.py (revision 7330f729ccf0bd976a06f95fad452fe774fc7fd1)
1*7330f729Sjoerg# -*- coding: utf-8 -*-
2*7330f729Sjoerg# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3*7330f729Sjoerg# See https://llvm.org/LICENSE.txt for license information.
4*7330f729Sjoerg# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.

This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
mechanisms provided by the dynamic linker. The related library is implemented
in C language and can be found under 'libear' directory.

The 'libear' library is capturing all child process creation and logging the
relevant information about it into separate files in a specified directory.
The parameter of this process is the output directory name, where the report
files shall be placed. This parameter is passed as an environment variable.

The module also implements compiler wrappers to intercept the compiler calls.

The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
21*7330f729Sjoerg
22*7330f729Sjoergimport sys
23*7330f729Sjoergimport os
24*7330f729Sjoergimport os.path
25*7330f729Sjoergimport re
26*7330f729Sjoergimport itertools
27*7330f729Sjoergimport json
28*7330f729Sjoergimport glob
29*7330f729Sjoergimport logging
30*7330f729Sjoergfrom libear import build_libear, TemporaryDirectory
31*7330f729Sjoergfrom libscanbuild import command_entry_point, compiler_wrapper, \
32*7330f729Sjoerg    wrapper_environment, run_command, run_build
33*7330f729Sjoergfrom libscanbuild import duplicate_check
34*7330f729Sjoergfrom libscanbuild.compilation import split_command
35*7330f729Sjoergfrom libscanbuild.arguments import parse_args_for_intercept_build
36*7330f729Sjoergfrom libscanbuild.shell import encode, decode
37*7330f729Sjoerg
# Names exported by a star-import of this module.
__all__ = [
    'capture',
    'intercept_build',
    'intercept_compiler_wrapper'
]
39*7330f729Sjoerg
# ASCII control characters used as delimiters inside the execution report
# files: the group separator closes a report, the record separator divides
# the fields of a report and the unit separator terminates each argument
# of the recorded command.
GS = '\x1d'
RS = '\x1e'
US = '\x1f'

# Values assigned to the CC and CXX variables when wrappers are in use.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
# File name suffix of the execution reports (same as in ear.c).
TRACE_FILE_EXTENSION = '.cmd'
# Platforms for which library preload is reported as disabled.
WRAPPER_ONLY_PLATFORMS = frozenset(('win32', 'cygwin'))
48*7330f729Sjoerg
49*7330f729Sjoerg
@command_entry_point
def intercept_build():
    """ Entry point for the 'intercept-build' command.

    Parses the command line arguments and hands them over to `capture`.

    :return: whatever `capture` returns for the parsed arguments. """

    return capture(parse_args_for_intercept_build())
56*7330f729Sjoerg
57*7330f729Sjoerg
def capture(args):
    """ The entry point of build command interception.

    Runs the build command with the environment prepared by
    `setup_environment`, parses the execution reports which were left in a
    temporary directory and writes the resulting entries as JSON into the
    file named by `args.cdb`.

    :param args:    the parsed command line arguments; this function reads
                    the `build` and `cdb` attributes and the optional
                    `append` attribute,
    :return:        the exit code returned by `run_build`. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands:    iterable of parsed execution reports,
        :return:            generator of the entries whose source file
                            exists, without duplicates. """

        # create entries from the current run; every execution report can
        # produce zero or more entries
        current = itertools.chain.from_iterable(
            format_entry(command) for command in commands)
        # read entries from previous run
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-') as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # collect the report files; the pattern is built from the shared
        # extension constant, and the matches already contain the directory
        # part, so they must not be joined with it again
        pattern = os.path.join(tmp_dir, '*' + TRACE_FILE_EXTENSION)
        trace_files = sorted(glob.iglob(pattern))
        # read the intercepted exec calls
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(trace_file) for trace_file in trace_files)
        # do post processing
        entries = post_processing(exec_traces)
        # dump the compilation database; the lazy entries are consumed here,
        # while the temporary directory still exists
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
100*7330f729Sjoerg
101*7330f729Sjoerg
def setup_environment(args, destination):
    """ Sets up the environment for the build command.

    It only prepares the environment variables, the build command itself is
    not executed here. With this environment the exec calls will be logged
    by the 'libear' preloaded library or by the 'wrapper' programs.

    :param args:        the parsed command line arguments; this function
                        reads `override_compiler` and the optional `cc`,
    :param destination: directory name for the execution reports, it is
                        also passed to `build_libear`,
    :return:            a copy of the current process environment extended
                        with the variables needed by the interception. """

    c_compiler = args.cc if 'cc' in args else 'cc'

    # the preload library is built only when it is going to be used
    use_wrappers = \
        args.override_compiler or is_preload_disabled(sys.platform)
    libear_path = \
        None if use_wrappers else build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update(wrapper_environment(args))
        environment.update({
            'CC': COMPILER_WRAPPER_CC,
            'CXX': COMPILER_WRAPPER_CXX
        })
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment.update({
            'DYLD_INSERT_LIBRARIES': libear_path,
            'DYLD_FORCE_FLAT_NAMESPACE': '1'
        })
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment['LD_PRELOAD'] = libear_path

    return environment
136*7330f729Sjoerg
137*7330f729Sjoerg
@command_entry_point
def intercept_compiler_wrapper():
    """ Entry point for `intercept-cc` and `intercept-c++`.

    Delegates to `compiler_wrapper` with the report writing callback.

    :return: whatever `compiler_wrapper` returns. """

    callback = intercept_compiler_wrapper_impl
    return compiler_wrapper(callback)
143*7330f729Sjoerg
144*7330f729Sjoerg
def intercept_compiler_wrapper_impl(_, execution):
    """ Write the execution report of a wrapped compiler call.

    The report goes into a file named after the current process id, inside
    the directory given by the INTERCEPT_BUILD_TARGET_DIR environment
    variable. Problems are only logged as warnings.

    :param _:           not used,
    :param execution:   the execution to report, it is passed on to
                        `write_exec_trace`. """

    warning_format = 'execution report might be incomplete: %s'

    report_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
    if report_dir:
        # append current execution info to the file of this process
        try:
            report_name = '{0}{1}'.format(os.getpid(), TRACE_FILE_EXTENSION)
            report_path = os.path.join(report_dir, report_name)
            logging.debug('writing execution report to: %s', report_path)
            write_exec_trace(report_path, execution)
        except IOError:
            logging.warning(warning_format, 'io problem')
    else:
        logging.warning(warning_format, 'missing target directory')
165*7330f729Sjoerg
166*7330f729Sjoerg
def write_exec_trace(filename, entry):
    """ Append an execution report to the given file.

    This method shall be in sync with the execution report writer in the
    interception library. A report is a sequence of fields divided by the
    record separator and closed by the group separator. The fields are:
    pid, pid again (in the position `parse_exec_trace` reads as 'ppid'),
    the literal 'wrapper', the working directory and the command, where
    every command argument is followed by the unit separator. The report
    is written as UTF-8 encoded bytes.

    :param filename:    path to the output execution trace file,
    :param entry:       the execution to append to that file; its `pid`,
                        `cwd` and `cmd` attributes are read. """

    with open(filename, 'ab') as trace_file:
        process_id = str(entry.pid)
        fields = [
            process_id,
            process_id,
            'wrapper',
            entry.cwd,
            US.join(entry.cmd) + US
        ]
        report = RS.join(fields) + GS
        trace_file.write(report.encode('utf-8'))
181*7330f729Sjoerg
182*7330f729Sjoerg
def parse_exec_trace(filename):
    """ Parse the file generated by the 'libear' preloaded library.

    Given filename points to a file which contains the basic report
    generated by the interception library or wrapper command. A single
    report file _might_ contain multiple process creation info.

    :param filename:    path to the execution trace file,
    :return:            generator of dictionaries with the 'pid', 'ppid',
                        'function', 'directory' and 'command' keys, where
                        'command' is the list of the arguments. """

    logging.debug('parse exec trace file: %s', filename)
    # The wrapper writes the reports as UTF-8 encoded bytes, therefore the
    # content is decoded the same way instead of depending on the default
    # encoding of the text mode, which comes from the locale settings.
    with open(filename, 'rb') as handler:
        content = handler.read().decode('utf-8')
    # the reports are closed by the group separator, so the split leaves an
    # empty element behind which has to be dropped
    for group in filter(bool, content.split(GS)):
        records = group.split(RS)
        yield {
            'pid': records[0],
            'ppid': records[1],
            'function': records[2],
            'directory': records[3],
            # every argument is followed by the unit separator, so the
            # last element of the split is always empty
            'command': records[4].split(US)[:-1]
        }
202*7330f729Sjoerg
203*7330f729Sjoerg
def format_entry(exec_trace):
    """ Generate compilation database entries from an execution report.

    Nothing is generated when `split_command` returns a false value for
    the command, otherwise one entry is generated per source file.

    :param exec_trace:  dictionary with the 'command' and 'directory' keys,
    :return:            generator of dictionaries with the 'directory',
                        'command' and 'file' keys. """

    def normalized(base, path):
        """ Return the normalized path, relative ones are put under base. """
        if not os.path.isabs(path):
            path = os.path.join(base, path)
        return os.path.normpath(path)

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        return

    # the compiler name is replaced by one of the two generic names
    compiler = 'c++' if compilation.compiler == 'c++' else 'cc'
    for source in compilation.files:
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)
        yield {
            'directory': exec_trace['directory'],
            'command': encode(command),
            'file': normalized(exec_trace['directory'], source)
        }
224*7330f729Sjoerg
225*7330f729Sjoerg
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker. """

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    if platform != 'darwin':
        return False

    command = ['csrutil', 'status']
    pattern = re.compile(r'System Integrity Protection status:\s+enabled')
    try:
        return any(pattern.match(line) for line in run_command(command))
    except Exception:
        # The check is best effort: when the status can not be queried the
        # preload is treated as enabled. Only ordinary errors are handled
        # here, so KeyboardInterrupt and SystemExit are not swallowed.
        logging.debug('could not query csrutil status', exc_info=True)
        return False
247*7330f729Sjoerg
248*7330f729Sjoerg
def entry_hash(entry):
    """ Build the key which identifies a compilation database entry.

    :param entry:   dictionary with the 'file', 'directory' and 'command'
                    keys,
    :return:        string made of the reverted file name, the reverted
                    directory name and the command without its first word,
                    joined by '<>'. """

    parts = [
        # For faster lookup in set the file and directory names are reverted
        entry['file'][::-1],
        entry['directory'][::-1],
        # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang'
        # therefore both call would be logged. To avoid this the hash does
        # not contain the first word of the command.
        ' '.join(decode(entry['command'])[1:])
    ]
    return '<>'.join(parts)
263