# -*- coding: utf-8 -*-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.

This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
mechanisms provided by the dynamic linker. The related library is implemented
in C language and can be found under 'libear' directory.

The 'libear' library is capturing all child process creation and logging the
relevant information about it into separate files in a specified directory.
The parameter of this process is the output directory name, where the report
files shall be placed. This parameter is passed as an environment variable.

The module also implements compiler wrappers to intercept the compiler calls.

The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """

import sys
import os
import os.path
import re
import itertools
import json
import glob
import logging
from libear import build_libear, TemporaryDirectory
from libscanbuild import command_entry_point, compiler_wrapper, \
    wrapper_environment, run_command, run_build
from libscanbuild import duplicate_check
from libscanbuild.compilation import split_command
from libscanbuild.arguments import parse_args_for_intercept_build
from libscanbuild.shell import encode, decode

__all__ = ['capture', 'intercept_build', 'intercept_compiler_wrapper']

# Separator characters of the execution trace files: GS terminates one
# execution record, RS separates the fields of a record and US terminates
# each argument inside the command field.
GS = chr(0x1d)
RS = chr(0x1e)
US = chr(0x1f)

COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
TRACE_FILE_EXTENSION = '.cmd'  # same as in ear.c
WRAPPER_ONLY_PLATFORMS = frozenset({'win32', 'cygwin'})


@command_entry_point
def intercept_build():
    """ Entry point for 'intercept-build' command. """

    args = parse_args_for_intercept_build()
    return capture(args)


def capture(args):
    """ The entry point of build command interception.

    Runs the build command with the interception environment, parses the
    execution trace files found in a temporary directory and writes the
    resulting compilation database into the 'args.cdb' file.

    :param args: the parsed command line arguments; 'build' and 'cdb' are
                 read from it, and 'append' too when it is present,
    :return: the value returned by 'run_build' for the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands: iterable of execution traces,
        :return: lazy generator of the merged, de-duplicated entries. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            format_entry(command) for command in commands)
        # read entries from previous run (loaded eagerly, because the same
        # file is going to be overwritten by the caller)
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both, and drop the entries
        # whose source file does not exist
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-') as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # read the intercepted exec calls; the glob results already carry
        # the directory prefix, so they are used as they are
        trace_pattern = os.path.join(tmp_dir, '*' + TRACE_FILE_EXTENSION)
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(filename)
            for filename in sorted(glob.iglob(trace_pattern)))
        # do post processing
        entries = post_processing(exec_traces)
        # dump the compilation database; the lazy entries must be consumed
        # here, while the temporary directory still exists
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
def setup_environment(args, destination):
    """ Sets up the environment for the build command.

    It creates a copy of the current environment extended with the variables
    required by the interception. The exec calls will be logged by the
    'libear' preloaded library or by the 'wrapper' programs.

    :param args: the parsed command line arguments,
    :param destination: directory where the execution reports are placed,
    :return: the environment (as dictionary) to run the build with. """

    c_compiler = args.cc if 'cc' in args else 'cc'

    # no library is built when the compiler wrappers are requested or
    # when the library preload would not work on this platform
    libear_path = None if args.override_compiler or is_preload_disabled(
        sys.platform) else build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment.update({'INTERCEPT_BUILD_TARGET_DIR': destination})

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update(wrapper_environment(args))
        environment.update({
            'CC': COMPILER_WRAPPER_CC,
            'CXX': COMPILER_WRAPPER_CXX
        })
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment.update({
            'DYLD_INSERT_LIBRARIES': libear_path,
            'DYLD_FORCE_FLAT_NAMESPACE': '1'
        })
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment.update({'LD_PRELOAD': libear_path})

    return environment


@command_entry_point
def intercept_compiler_wrapper():
    """ Entry point for `intercept-cc` and `intercept-c++`. """

    return compiler_wrapper(intercept_compiler_wrapper_impl)


def intercept_compiler_wrapper_impl(_, execution):
    """ Implement intercept compiler wrapper functionality.

    It does generate execution report into target directory.
    The target directory name is from environment variables.

    :param _: ignored argument,
    :param execution: the execution to report; it is passed to
                      'write_exec_trace' as it is. """

    message_prefix = 'execution report might be incomplete: %s'

    target_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
    if not target_dir:
        logging.warning(message_prefix, 'missing target directory')
        return
    # write current execution info to the pid file
    try:
        target_file_name = str(os.getpid()) + TRACE_FILE_EXTENSION
        target_file = os.path.join(target_dir, target_file_name)
        logging.debug('writing execution report to: %s', target_file)
        write_exec_trace(target_file, execution)
    except IOError:
        # the report is best effort, a failure is logged only
        logging.warning(message_prefix, 'io problem')


def write_exec_trace(filename, entry):
    """ Write execution report file.

    This method shall be sync with the execution report writer in interception
    library. The entry in the file is a sequence of fields (pid, parent pid,
    function name, working directory, command) separated by RS characters
    and closed by a GS character. Every argument of the command is closed by
    a US character.

    :param filename:    path to the output execution trace file,
    :param entry:       the Execution object to append to that file. """

    with open(filename, 'ab') as handler:
        pid = str(entry.pid)
        command = US.join(entry.cmd) + US
        # the pid is written into the parent pid field too
        content = RS.join([pid, pid, 'wrapper', entry.cwd, command]) + GS
        handler.write(content.encode('utf-8'))


def parse_exec_trace(filename):
    """ Parse the file generated by the 'libear' preloaded library.

    Given filename points to a file which contains the basic report
    generated by the interception library or wrapper command. A single
    report file _might_ contain multiple process creation info.

    Records with missing fields are skipped (with a warning) instead of
    failing the processing of every other record.

    :param filename: path to the execution trace file,
    :return: generator of dictionaries with the 'pid', 'ppid', 'function',
             'directory' and 'command' keys. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
        for group in filter(bool, content.split(GS)):
            records = group.split(RS)
            if len(records) < 5:
                logging.warning('skipping malformed execution report in: %s',
                                filename)
                continue
            yield {
                'pid': records[0],
                'ppid': records[1],
                'function': records[2],
                'directory': records[3],
                # every argument is closed by US, so the last element
                # of the split is dropped
                'command': records[4].split(US)[:-1]
            }


def format_entry(exec_trace):
    """ Generate the desired fields for compilation database entries.

    :param exec_trace: dictionary with the 'command' and 'directory' keys
                       (as 'parse_exec_trace' generates it),
    :return: generator of entries, one for each source file of the command;
             empty when 'split_command' returns a false value. """

    def abspath(cwd, name):
        """ Create normalized absolute path from input filename. """
        fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
        return os.path.normpath(fullname)

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if compilation:
        # the compiler name does not depend on the source file
        compiler = 'c++' if compilation.compiler == 'c++' else 'cc'
        for source in compilation.files:
            command = [compiler, '-c'] + compilation.flags + [source]
            logging.debug('formated as: %s', command)
            yield {
                'directory': exec_trace['directory'],
                'command': encode(command),
                'file': abspath(exec_trace['directory'], source)
            }


def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker. """

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    elif platform == 'darwin':
        command = ['csrutil', 'status']
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        try:
            return any(pattern.match(line) for line in run_command(command))
        except Exception:
            # failing to run 'csrutil' is treated as preload is available;
            # KeyboardInterrupt and SystemExit are not swallowed here
            return False
    else:
        return False


def entry_hash(entry):
    """ Implement unique hash method for compilation database entries.

    :param entry: dictionary with the 'file', 'directory' and 'command' keys,
    :return: string which identifies the entry. """

    # For faster lookup in set filename is reverted
    filename = entry['file'][::-1]
    # For faster lookup in set directory is reverted
    directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for
    # 'clang' therefore both call would be logged. To avoid
    # this the hash does not contain the first word of the
    # command.
    command = ' '.join(decode(entry['command'])[1:])

    return '<>'.join([filename, directory, command])