# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Extract IR for training.

Extract IR for training, either from a compile_commands.json file produced by
cmake, or a linker parameter list file.

Only run with
'python compiler_opt/tools/extract_ir.py ...'

The compilation is assumed to have been performed with clang, using
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)

In a distributed ThinLTO case, the compilation is assumed to have been performed
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.

In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files

To change the logging verbosity, set the --verbosity flag to the desired level.
Setting it to a specific level will enable all messages at that level and
higher. Exact values can be found by invoking the script with --help.
"""

import argparse
import json
import logging

from mlgo.corpus import extract_ir_lib


def parse_args_and_run():
    """Parse the command line and hand the resulting namespace to `main`."""
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. "
        "`None` for maximum available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    args = parser.parse_args()
    main(args)


def _load_objects(args):
    """Return the objects to extract, loaded according to the parsed arguments.

    Args:
        args: Namespace carrying `input`, `input_type`, `thinlto_build`,
            `obj_base_dir` and `output_dir`.

    Returns:
        Whatever the selected `extract_ir_lib` loader returns for the input.

    Raises:
        ValueError: If `--input` is combined with `--thinlto_build=local`, if
            neither of them is given, or if `input_type` is not recognised.
    """
    if args.input is not None and args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input is None:
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        return extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)

    if args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )

    if args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )

    if args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        return extract_ir_lib.load_from_directory(args.input, args.output_dir)

    if args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            return extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )

    # Fail fast instead of extracting from an empty object list and writing an
    # empty corpus manifest; argparse `choices` prevents this on the command
    # line, but `main` can also be called with a hand-built namespace.
    raise ValueError(f"Unknown input type: {args.input_type}")


def main(args):
    """Build a corpus from the build artifacts described by `args`.

    Configures logging from `args.verbosity`, loads the objects to process,
    runs the extraction, writes the corpus manifest into `args.output_dir`
    and logs how many files were converted.

    Args:
        args: Parsed arguments, as produced by the parser that
            `parse_args_and_run` builds.

    Raises:
        ValueError: If the input-related arguments are inconsistent or the
            input type is not recognised.
    """
    logging.basicConfig(level=args.verbosity)

    objs = _load_objects(args)

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # `None` entries in the returned paths are counted as not converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )


if __name__ == "__main__":
    parse_args_and_run()