# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Extract IR for training.

Extract IR for training, either from a compile_commands.json file produced by
cmake, or a linker parameter list file.

Only run with
'python compiler_opt/tools/extract_ir.py ...'

The compilation is assumed to have been performed with clang, using
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)

In a distributed ThinLTO case, the compilation is assumed to have been performed
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.

In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files

To change the logging verbosity, set the --verbosity flag to the desired level.
Setting it to a specific level will enable all messages at that level and
higher. Exact values can be found by invoking the script with --help.
"""

import argparse
import json
import logging

from mlgo.corpus import extract_ir_lib


def parse_args_and_run():
    """Parse the command line flags and hand the result to `main`.

    Every optional flag is declared with nargs="?" so it may be given with or
    without a value; the parsed namespace is passed to `main` unchanged.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel works for objcopy. `None` for maximum available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=lld.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    args = parser.parse_args()
    main(args)


def _load_objs(args):
    """Build the list of objects to extract from, based on the input flags.

    Dispatches on `args.input` / `args.input_type` to the matching loader in
    `extract_ir_lib` and returns whatever that loader returns.

    Raises:
        ValueError: if --input is combined with --thinlto_build=local, if
            neither is provided, or if `args.input_type` is not one of the
            supported input types.
    """
    if args.input is None:
        # Without --input the only supported mode is local ThinLTO, which
        # discovers its inputs under obj_base_dir.
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        return extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)

    if args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )

    if args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            return extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )

    if args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        return extract_ir_lib.load_from_directory(args.input, args.output_dir)

    if args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            return extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )

    # Fail loudly instead of extracting from an empty object list and writing
    # an empty corpus manifest. This is reachable when `main` is called with a
    # hand-built namespace, and presumably also from the command line, since
    # --input_type uses nargs="?" with no const (a bare flag yields None).
    raise ValueError(f"Unknown input type: {args.input_type}")


def main(args):
    """Extract a corpus from the build artifacts described by `args`.

    Configures logging from `args.verbosity`, loads the objects to process,
    runs the extraction through `extract_ir_lib`, writes the corpus manifest
    into `args.output_dir`, and logs how many files were converted.

    Args:
        args: namespace carrying the flags declared in `parse_args_and_run`.

    Raises:
        ValueError: if the input flags are inconsistent or the input type is
            unknown (see `_load_objs`).
    """
    logging.basicConfig(level=args.verbosity)

    objs = _load_objs(args)

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # Entries that are None in the result are counted as not converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )


if __name__ == "__main__":
    parse_args_and_run()