# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library functions for IR extraction."""

import os
import pathlib
import re
import shutil
import subprocess
import multiprocessing
import functools
import json
import logging

from typing import Dict, List, Optional

_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]


# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file, from a
# \0 - separated list of strings, to a \n one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
    """Determine if the module should be included.

    Args:
      cmdline: the compiler command line, as a \\0-separated list of options.
      match_regexp: a regular expression searched for in each option, or None
        to include every module.

    Returns:
      True if match_regexp is None or if it matches at least one option.
    """
    if match_regexp is None:
        return True
    options = cmdline.split("\0")
    return any(re.search(match_regexp, option) for option in options)


def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Find the ThinLTO index file named on a command line.

    Args:
      cmdline: the compiler command line, as a \\0-separated list of options.
      basedir: the directory the index path is joined onto.

    Returns:
      basedir joined with the value of the first -fthinlto-index=<path> option,
      or None if no such option carries a value.
    """
    for option in cmdline.split("\0"):
        if option.startswith("-fthinlto-index"):
            # Split on the first '=' only, so paths that themselves contain '='
            # are preserved, and a value-less option is skipped, not a crash.
            _, separator, index_path = option.partition("=")
            if separator:
                return os.path.join(basedir, index_path)
    return None


class TrainingIRExtractor:
    """IR and command line extraction from an object file."""

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will be
            also used to construct the absolute path of the output IR and cmd
            files, by appending it to output_base_dir.
          output_base_dir: the directory under which the output will be produced.
          obj_base_dir: the base directory for all the input object files.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""

    def obj_base_dir(self):
        """Return the base directory of the input object files ("" if unset)."""
        return self._obj_base_dir

    def output_base_dir(self):
        """Return the directory under which outputs are produced."""
        return self._output_base_dir

    def relative_output_path(self):
        """Return the output path relative to the output base directory."""
        return self._obj_relative_path

    def input_obj(self):
        """Return the path of the input object file."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def lld_src_bc(self):
        """Return the path of the postimport bitcode file saved by lld."""
        # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
        # IR bitcode saved by lld. It is hardcoded into lld.
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
        )

    def lld_src_thinlto(self):
        """Return the path of the ThinLTO index file saved by lld."""
        return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")

    def dest_dir(self):
        """Return the directory the outputs for this module are written to."""
        return os.path.join(
            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
        )

    def module_name(self):
        """Return the base name of the object file."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self):
        """Return the path of the output command line (.cmd) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")

    def bc_file(self):
        """Return the path of the output bitcode (.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".bc")

    def thinlto_index_file(self):
        """Return the path of the output ThinLTO index (.thinlto.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")

    def _get_extraction_cmd_command(
        self, llvm_objcopy_path: str, cmd_section_name: str
    ):
        """Get the llvm-objcopy invocation that dumps the command line section.

        Returns the argument list that, when invoked, will extract the cmd
        section into the self.cmd_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
            self.input_obj(),
            "/dev/null",
        ]

    def _get_extraction_bc_command(
        self, llvm_objcopy_path: str, bitcode_section_name: str
    ):
        """Get the llvm-objcopy invocation that dumps the bitcode section.

        Returns the argument list that, when invoked, will extract the bitcode
        section into the self.bc_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
            self.input_obj(),
            "/dev/null",
        ]

    def _extract_clang_artifacts(
        self,
        llvm_objcopy_path: str,
        cmd_filter: Optional[str],
        is_thinlto: bool,
        cmd_section_name: str,
        bitcode_section_name: str,
    ) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary to run.
          cmd_filter: regular expression the command line must match for the
            module to be kept, or None to keep every module.
          is_thinlto: whether the ThinLTO index file named on the command line
            should be copied next to the extracted bitcode.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path on success, or None if the module was
          skipped (missing input, filtered out, no ThinLTO index on the command
          line, or llvm-objcopy failed).
        """
        if not os.path.exists(self.input_obj()):
            logging.info("%s does not exist.", self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.check_output(
                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
            if cmd_filter is not None or is_thinlto:
                with open(self.cmd_file(), encoding="utf-8") as f:
                    lines = f.readlines()
                assert len(lines) == 1
                cmdline = lines[0]
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        "Excluding module %s because it does not match the filter",
                        self.input_obj(),
                    )
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
                    if index_file is None:
                        # Without an index there is nothing to copy; skip the
                        # module instead of handing None to shutil.copy.
                        logging.warning(
                            "%s has no -fthinlto-index option in its command "
                            "line; skipping.",
                            self.input_obj(),
                        )
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())

            subprocess.check_output(
                self._get_extraction_bc_command(
                    llvm_objcopy_path, bitcode_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            # This may happen if .o file was built from asm (.S source).
            logging.warning("%s was not processed: %s", self.input_obj(), e)
            logging.info(e.output)
            return None
        assert (
            os.path.exists(self.cmd_file())
            and os.path.exists(self.bc_file())
            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
        )
        return self.relative_output_path()

    def _extract_lld_artifacts(self) -> Optional[str]:
        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

        Returns:
          The relative output path on success, or None if either the postimport
          bitcode or the ThinLTO index saved by lld does not exist.
        """
        if not os.path.exists(self.lld_src_bc()):
            logging.info("%s does not exist.", self.lld_src_bc())
            return None
        if not os.path.exists(self.lld_src_thinlto()):
            logging.info("%s does not exist.", self.lld_src_thinlto())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)

        # Copy over the files
        shutil.copy(self.lld_src_bc(), self.bc_file())
        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

        assert os.path.exists(self.bc_file())
        assert os.path.exists(self.thinlto_index_file())
        return self._obj_relative_path

    def extract(
        self,
        llvm_objcopy_path: Optional[str] = None,
        cmd_filter: Optional[str] = None,
        thinlto_build: Optional[str] = None,
        cmd_section_name: Optional[str] = ".llvmcmd",
        bitcode_section_name: Optional[str] = ".llvmbc",
    ) -> Optional[str]:
        """Extract the artifacts for this module.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary; unused when
            thinlto_build is "local".
          cmd_filter: regular expression the command line must match, or None.
          thinlto_build: "local" to copy the files saved by lld, "distributed"
            to also copy the ThinLTO index named on the command line; any other
            value extracts only the command line and bitcode sections.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path on success, or None if the module was
          skipped.
        """
        if thinlto_build == "local":
            return self._extract_lld_artifacts()
        return self._extract_clang_artifacts(
            llvm_objcopy_path=llvm_objcopy_path,
            cmd_filter=cmd_filter,
            is_thinlto=thinlto_build == "distributed",
            cmd_section_name=cmd_section_name,
            bitcode_section_name=bitcode_section_name,
        )


def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str
) -> Optional[TrainingIRExtractor]:
    """Build an extractor from one compile_commands.json entry.

    Args:
      command: one element of the compile commands array. It must have a
        "directory" key and either an "arguments" or a "command" key.
      output_dir: the directory under which the output will be produced.

    Returns:
      A TrainingIRExtractor for the file named after -o, or None if the entry
      has no command, no -o option, or no operand following -o.
    """
    obj_base_dir = command["directory"]
    if "arguments" in command:
        cmd_parts = command["arguments"]
    elif "command" in command:
        cmd_parts = command["command"].split()
    else:
        logging.info("compile_commands element has no command and arguments")
        return None

    try:
        obj_index = cmd_parts.index("-o") + 1
    except ValueError:
        # This could happen if there are non-clang commands in compile_commands.json
        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
        return None
    if obj_index >= len(cmd_parts):
        # A trailing -o names no output file; there is nothing to extract from.
        logging.info("Command has no output after -o: %s", " ".join(cmd_parts))
        return None
    obj_rel_path = cmd_parts[obj_index]
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir,
    )


def load_from_compile_commands(
    json_array: List[Dict[str, str]], output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an extractor for each usable entry of a compile commands array.

    Args:
      json_array: the parsed compile_commands.json array.
      output_dir: the directory under which the output will be produced.
    """
    objs = [
        convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array
    ]
    # Filter out None, in case there were non-clang commands in the .json
    return [obj for obj in objs if obj is not None]


def load_from_lld_params(
    params_array: List[str], obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an ObjectFile array based on lld's parameters.

    Args:
      params_array: the lld command line parameters. The list is not modified.
      obj_base_dir: the base directory for all the input object files.
      output_dir: the directory under which the output will be produced.

    Returns:
      One extractor per object file. If there is no -o option, every parameter
      is treated as an object file.
    """
    # Work on a copy so the caller's list is left untouched.
    params = list(params_array)
    # yank out -o and the output. After that, anything not starting with '-', and
    # ending in a '.o', is an object file.
    try:
        minus_o_idx = params.index("-o")
    except ValueError:
        logging.info("This params file does not have an explicit -o option.")
        just_obj_paths = params
    else:
        del params[minus_o_idx : minus_o_idx + 2]
        just_obj_paths = [
            o for o in params if not o.startswith("-") and o.endswith(".o")
        ]

    def make_obj(obj_file: str) -> TrainingIRExtractor:
        return TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )

    return [make_obj(obj_file) for obj_file in just_obj_paths]


def load_from_directory(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array by globbing an entire directory.

    Args:
      obj_base_dir: The base build directory that all object files will be
        written out as being relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.
    """
    paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.o")]

    def make_spec(obj_file: str):
        return TrainingIRExtractor(
            obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )

    return [make_spec(path) for path in paths]


def load_for_lld_thinlto(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array from the postimport bitcode saved by lld.

    Args:
      obj_base_dir: The base build directory that is globbed for postimport
        bitcode files, and that the resulting paths are relative to.
      output_dir: The output directory where the extracted files should be
        placed.
    """
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
    # are also emitted next to the postimport bitcode, with the suffix
    # .thinlto.bc instead
    postimport_suffix = ".3.import.bc"
    paths = [
        str(p) for p in pathlib.Path(obj_base_dir).glob("**/*" + postimport_suffix)
    ]

    def make_spec(obj_file: str):
        relative_path = os.path.relpath(obj_file, start=obj_base_dir)
        return TrainingIRExtractor(
            # Cut away .3.import.bc
            obj_relative_path=relative_path[: -len(postimport_suffix)],
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )

    return [make_spec(path) for path in paths]


def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
    """Creates an object file array by looking at the JSON output of bazel aquery.

    If the query contains several CppLink actions, the arguments of the last
    one are used.

    Args:
      aquery_json: The JSON-formatted output of the bazel aquery command for
        the target of interest. The bazel aquery JSON should be a JSON
        serialized version of the analysis.ActionGraphContainer proto.
        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
      obj_base_dir: The base build directory that all object files will be
        written out as relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.
    """
    linker_params = []

    for action_info in aquery_json["actions"]:
        if action_info["mnemonic"] != "CppLink":
            continue
        linker_params = action_info["arguments"]

    return load_from_lld_params(linker_params, obj_base_dir, output_dir)


def run_extraction(
    objs: List[TrainingIRExtractor],
    num_workers: int,
    llvm_objcopy_path: str,
    cmd_filter: Optional[str],
    thinlto_build: Optional[str],
    cmd_section_name: str,
    bitcode_section_name: str,
):
    """Extracts all specified object files into the corpus directory.

    Args:
      objs: A list of TrainingIRExtractor Objects that represent the object files
        to extract bitcode/commands from.
      num_workers: The number of parallel processes to spawn to run the
        extraction.
      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
      cmd_filter: A regular expression that is used to select for compilations
        performed with specific flags. If you want to include all compilations,
        set this to None.
      thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
        Set this to None if the build was not done with ThinLTO.
      cmd_section_name: The name of the command line section created by the
        bitcode embedding.
      bitcode_section_name: The name of the bitcode section created by the
        bitcode embedding.

    Returns:
      A list with one entry per element of objs: the relative output path of
      the module, or None if the module was skipped.
    """
    extract_artifacts = functools.partial(
        TrainingIRExtractor.extract,
        llvm_objcopy_path=llvm_objcopy_path,
        cmd_filter=cmd_filter,
        thinlto_build=thinlto_build,
        cmd_section_name=cmd_section_name,
        bitcode_section_name=bitcode_section_name,
    )

    with multiprocessing.Pool(num_workers) as pool:
        relative_output_paths = pool.map(extract_artifacts, objs)
        pool.close()
        pool.join()
    return relative_output_paths


def write_corpus_manifest(
    thinlto_build: Optional[str],
    relative_output_paths: List[Optional[str]],
    output_dir: str,
):
    """Writes a corpus_description.json containing all necessary information
    about the corpus.

    Args:
      thinlto_build: Whether or not the build was done with ThinLTO and if so,
        what kind of ThinLTO. Set this to None if the build was not performed
        with ThinLTO.
      relative_output_paths: The relative (to the corpus directory) output paths
        of all the bitcode files that should be placed in the corpus manifest.
        None entries are dropped.
      output_dir: The corpus directory where the corpus manifest should be
        placed.
    """
    # This comes first rather than later so global_command_override is at the top
    # of the .json after being written
    if thinlto_build == "local":
        corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE}
    else:
        corpus_description = {}

    corpus_description.update(
        {
            "has_thinlto": thinlto_build is not None,
            "modules": [path for path in relative_output_paths if path is not None],
        }
    )

    with open(
        os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
    ) as f:
        json.dump(corpus_description, f, indent=2)