xref: /llvm-project/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py (revision 4134b33c6a362cb462b335177d6d9e8235f04309)
1# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2# See https://llvm.org/LICENSE.txt for license information.
3# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4"""Library functions for IR extraction."""
5
6import os
7import pathlib
8import re
9import shutil
10import subprocess
11import multiprocessing
12import functools
13import json
14import logging
15
16from typing import Dict, List, Optional
17
18_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
19
20
# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
# a \0-separated list of strings to a \n-separated one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
    """Decide whether a module passes the command line filter.

    Args:
      cmdline: the module's command line, as a \\0-separated list of arguments.
      match_regexp: regular expression to look for in the arguments, or None
        to accept every module.

    Returns:
      True if no filter was given, or if the expression matches somewhere in
      at least one argument; False otherwise.
    """
    if match_regexp is None:
        return True
    for argument in cmdline.split("\0"):
        if re.search(match_regexp, argument) is not None:
            return True
    return False
29
30
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Find the ThinLTO index file named on a command line.

    Args:
      cmdline: the module's command line, as a \\0-separated list of arguments.
      basedir: directory that the path given to the option is joined onto.

    Returns:
      basedir joined with the value of the first option that starts with
      "-fthinlto-index" and carries an "=value" part, or None if there is no
      such option.
    """
    for option in cmdline.split("\0"):
        if not option.startswith("-fthinlto-index"):
            continue
        # Split on the first "=" only, so that a path which itself contains
        # "=" is kept whole.
        _, separator, index_path = option.partition("=")
        if separator:
            return os.path.join(basedir, index_path)
    return None
37
38
class TrainingIRExtractor:
    """IR and command line extraction from an object file."""

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will be also
            used to construct the absolute path of the output IR and cmd files, by
            appending it to output_base_dir.
          output_base_dir: the directory under which the output will be produced.
          obj_base_dir: the base directory for all the input object files.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        # An unset base directory becomes "", which os.path.join treats as
        # "no prefix".
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""

    def obj_base_dir(self) -> str:
        """Return the base directory of the input object files."""
        return self._obj_base_dir

    def output_base_dir(self) -> str:
        """Return the directory under which the output is produced."""
        return self._output_base_dir

    def relative_output_path(self) -> str:
        """Return the relative path of the object file, as given at setup."""
        return self._obj_relative_path

    def input_obj(self) -> str:
        """Return the path of the input object file under the base directory."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def lld_src_bc(self) -> str:
        """Return the path of the post-import bitcode file saved by lld."""
        # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
        # IR bitcode saved by lld. It is hardcoded into lld.
        return os.path.join(
            self.obj_base_dir(), self._obj_relative_path + ".3.import.bc"
        )

    def lld_src_thinlto(self) -> str:
        """Return the path of the .thinlto.bc file next to the input object."""
        return os.path.join(
            self.obj_base_dir(), self._obj_relative_path + ".thinlto.bc"
        )

    def dest_dir(self) -> str:
        """Return the output directory for this module's files."""
        return os.path.join(
            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
        )

    def module_name(self) -> str:
        """Return the file name component of the object's relative path."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self) -> str:
        """Return the path of the output command line (.cmd) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")

    def bc_file(self) -> str:
        """Return the path of the output bitcode (.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".bc")

    def thinlto_index_file(self) -> str:
        """Return the path of the output ThinLTO index (.thinlto.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")

    def _get_extraction_cmd_command(
        self, llvm_objcopy_path: str, cmd_section_name: str
    ):
        """Build the llvm-objcopy argument list that, when invoked, dumps the
        command line section of the input object into the self.cmd_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
            self.input_obj(),
            "/dev/null",
        ]

    def _get_extraction_bc_command(
        self, llvm_objcopy_path: str, bitcode_section_name: str
    ):
        """Build the llvm-objcopy argument list that, when invoked, dumps the
        bitcode section of the input object into the self.bc_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
            self.input_obj(),
            "/dev/null",
        ]

    def _extract_clang_artifacts(
        self,
        llvm_objcopy_path: str,
        cmd_filter: Optional[str],
        is_thinlto: bool,
        cmd_section_name: str,
        bitcode_section_name: str,
    ) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: the llvm-objcopy executable to invoke.
          cmd_filter: regular expression a command line argument must match for
            the module to be kept, or None to keep every module.
          is_thinlto: whether to also copy the ThinLTO index file named by the
            module's command line.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path of the module, or None if the module was
          skipped: the input object is missing, the filter rejected it, no
          ThinLTO index is named on its command line, or llvm-objcopy failed.
        """
        if not os.path.exists(self.input_obj()):
            logging.info("%s does not exist.", self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.check_output(
                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
            # The command line only needs to be read back when it is filtered
            # on or searched for the ThinLTO index option.
            if cmd_filter is not None or is_thinlto:
                with open(self.cmd_file(), encoding="utf-8") as f:
                    lines = f.readlines()
                assert len(lines) == 1
                cmdline = lines[0]
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        "Excluding module %s because it does not match the filter",
                        self.input_obj(),
                    )
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
                    if index_file is None:
                        # Without this check shutil.copy(None, ...) would fail
                        # with an opaque TypeError; skip the module instead.
                        logging.warning(
                            "%s was not processed: its command line has no "
                            "-fthinlto-index option",
                            self.input_obj(),
                        )
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())

            subprocess.check_output(
                self._get_extraction_bc_command(
                    llvm_objcopy_path, bitcode_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            # This may happen if the .o file was built from asm (.S source).
            logging.warning("%s was not processed: %s", self.input_obj(), e)
            logging.info(e.output)
            return None
        assert (
            os.path.exists(self.cmd_file())
            and os.path.exists(self.bc_file())
            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
        )
        return self.relative_output_path()

    def _extract_lld_artifacts(self) -> Optional[str]:
        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

        Returns:
          The relative output path of the module, or None if either source
          file does not exist.
        """
        if not os.path.exists(self.lld_src_bc()):
            logging.info("%s does not exist.", self.lld_src_bc())
            return None
        if not os.path.exists(self.lld_src_thinlto()):
            logging.info("%s does not exist.", self.lld_src_thinlto())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)

        # Copy over the files
        shutil.copy(self.lld_src_bc(), self.bc_file())
        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

        assert os.path.exists(self.bc_file())
        assert os.path.exists(self.thinlto_index_file())
        return self.relative_output_path()

    def extract(
        self,
        llvm_objcopy_path: Optional[str] = None,
        cmd_filter: Optional[str] = None,
        thinlto_build: Optional[str] = None,
        cmd_section_name: Optional[str] = ".llvmcmd",
        bitcode_section_name: Optional[str] = ".llvmbc",
    ) -> Optional[str]:
        """Extract this module's artifacts into the output directory.

        Args:
          llvm_objcopy_path: the llvm-objcopy executable; not used when
            thinlto_build is "local".
          cmd_filter: regular expression used to select modules by command
            line, or None to keep every module; not used when thinlto_build is
            "local".
          thinlto_build: "local" copies the files saved by lld, "distributed"
            additionally copies the ThinLTO index named on the command line,
            and any other value performs a plain llvm-objcopy extraction.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path of the module, or None if it was skipped.
        """
        if thinlto_build == "local":
            return self._extract_lld_artifacts()
        return self._extract_clang_artifacts(
            llvm_objcopy_path=llvm_objcopy_path,
            cmd_filter=cmd_filter,
            is_thinlto=thinlto_build == "distributed",
            cmd_section_name=cmd_section_name,
            bitcode_section_name=bitcode_section_name,
        )
211
212
def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str
) -> Optional["TrainingIRExtractor"]:
    """Build an extractor from one compile_commands.json entry.

    Args:
      command: one entry of the compilation database. It must have a
        "directory" key, and either an "arguments" list or a "command" string.
      output_dir: the directory under which the output will be produced.

    Returns:
      A TrainingIRExtractor for the file named after the "-o" option, with the
      entry's "directory" as its object base directory, or None if the entry
      has neither "arguments" nor "command", or has no "-o" option followed by
      an operand.
    """
    import shlex

    obj_base_dir = command["directory"]
    if "arguments" in command:
        cmd_parts = command["arguments"]
    elif "command" in command:
        # "command" is a single shell-escaped string, so a plain whitespace
        # split would break quoted arguments apart.
        try:
            cmd_parts = shlex.split(command["command"])
        except ValueError:
            # Unbalanced quoting: fall back to a whitespace split.
            cmd_parts = command["command"].split()
    else:
        logging.info("compile_commands element has no command and arguments")
        return None

    try:
        obj_rel_path = cmd_parts[cmd_parts.index("-o") + 1]
    except (ValueError, IndexError):
        # This could happen if there are non-clang commands in
        # compile_commands.json, or if "-o" is the last argument.
        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
        return None
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir,
    )
238
239
def load_from_compile_commands(
    json_array: List[Dict[str, str]], output_dir: str
) -> List[TrainingIRExtractor]:
    """Build extractors for the entries of a compilation database.

    Args:
      json_array: the entries of the compilation database.
      output_dir: the directory under which the output will be produced.

    Returns:
      One TrainingIRExtractor per entry that could be converted, in input
      order.
    """
    extractors = []
    for entry in json_array:
        extractor = convert_compile_command_to_objectfile(entry, output_dir)
        # Entries that could not be converted (e.g. non-clang commands in the
        # .json) come back as None and are dropped.
        if extractor is not None:
            extractors.append(extractor)
    return extractors
248
249
def load_from_lld_params(
    params_array: List[str], obj_base_dir: str, output_dir: str
) -> List["TrainingIRExtractor"]:
    """Create a TrainingIRExtractor array based on lld's parameters.

    Args:
      params_array: the linker parameters. The list is not modified.
      obj_base_dir: the base directory for all the input object files.
      output_dir: the directory under which the output will be produced.

    Returns:
      One TrainingIRExtractor per object file path. If the parameters contain
      "-o", that option and its operand are dropped and only entries that do
      not start with "-" and end in ".o" are kept; otherwise every parameter
      is taken to be an object file path.
    """
    try:
        minus_o_idx = params_array.index("-o")
    except ValueError:
        logging.info("This params file does not have an explicit -o option.")
        just_obj_paths = list(params_array)
    else:
        # Yank out -o and the output on a copy, so that the caller's list is
        # left untouched. After that, anything not starting with '-', and
        # ending in a '.o', is an object file.
        remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2 :]
        just_obj_paths = [
            o for o in remaining if not o.startswith("-") and o.endswith(".o")
        ]

    return [
        TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for obj_file in just_obj_paths
    ]
274
275
def load_from_directory(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array by globbing an entire directory.

    Args:
      obj_base_dir: The base build directory that all object files will be
        written out as being relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      One TrainingIRExtractor per "*.o" file found anywhere under obj_base_dir.
    """
    object_files = [
        str(found) for found in pathlib.Path(obj_base_dir).glob("**/*.o")
    ]
    return [
        TrainingIRExtractor(
            obj_relative_path=os.path.relpath(object_file, start=obj_base_dir),
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for object_file in object_files
    ]
297
298
def load_for_lld_thinlto(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array from the bitcode files saved by lld.

    Args:
      obj_base_dir: The base build directory that is searched for
        "*.3.import.bc" files, and that their paths are made relative to.
      output_dir: The output directory where the extracted files should be
        placed.

    Returns:
      One TrainingIRExtractor per "*.3.import.bc" file found anywhere under
      obj_base_dir, named by its relative path with that suffix removed.
    """
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
    # are also emitted next to the postimport bitcode, with the suffix
    # .thinlto.bc instead
    postimport_suffix = ".3.import.bc"
    suffix_length = len(postimport_suffix)

    extractors = []
    for found in pathlib.Path(obj_base_dir).glob("**/*" + postimport_suffix):
        relative_bc = os.path.relpath(str(found), start=obj_base_dir)
        extractors.append(
            TrainingIRExtractor(
                # Cut away .3.import.bc
                obj_relative_path=relative_bc[:-suffix_length],
                output_base_dir=output_dir,
                obj_base_dir=obj_base_dir,
            )
        )
    return extractors
317
318
def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
    """Creates an object file array by looking at the JSON output of bazel aquery.

    Args:
      aquery_json: The JSON-formatted output of the bazel aquery command for
        the target of interest. The bazel aquery JSON should be a JSON
        serialized version of the analysis.ActionGraphContainer proto.
        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
      obj_base_dir: The base build directory that all object files will be
        written out as relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      The result of load_from_lld_params on the arguments of the last
      "CppLink" action, or on an empty list if there is no such action.
    """
    link_arguments = [
        action["arguments"]
        for action in aquery_json["actions"]
        if action["mnemonic"] == "CppLink"
    ]
    # When several CppLink actions are present, the last one wins.
    linker_params = link_arguments[-1] if link_arguments else []
    return load_from_lld_params(linker_params, obj_base_dir, output_dir)
340
341
def run_extraction(
    objs: List[TrainingIRExtractor],
    num_workers: int,
    llvm_objcopy_path: str,
    cmd_filter: str,
    thinlto_build: str,
    cmd_section_name: str,
    bitcode_section_name: str,
):
    """Extracts all specified object files into the corpus directory.

    Args:
      objs: A list of TrainingIRExtractor Objects that represent the object files
        to extract bitcode/commands from.
      num_workers: The number of parallel processes to spawn to run the
        extraction.
      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
      cmd_filter: A regular expression that is used to select for compilations
        performed with specific flags. If you want to include all compilations,
        set this to None.
      thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
        Set this to None if the build was not done with ThinLTO.
      cmd_section_name: The name of the command line section created by the
        bitcode embedding.
      bitcode_section_name: The name of the bitcode section created by the
        bitcode embedding.

    Returns:
      A list with one entry per element of objs, in the same order, holding
      whatever TrainingIRExtractor.extract returned for it.
    """
    # The same options apply to every object, so bind them once and let the
    # pool supply only the extractor itself.
    shared_options = {
        "llvm_objcopy_path": llvm_objcopy_path,
        "cmd_filter": cmd_filter,
        "thinlto_build": thinlto_build,
        "cmd_section_name": cmd_section_name,
        "bitcode_section_name": bitcode_section_name,
    }
    extract_one = functools.partial(TrainingIRExtractor.extract, **shared_options)

    with multiprocessing.Pool(num_workers) as worker_pool:
        extracted_paths = worker_pool.map(extract_one, objs)
        worker_pool.close()
        worker_pool.join()
    return extracted_paths
383
384
def write_corpus_manifest(
    thinlto_build: str, relative_output_paths: List[str], output_dir: str
):
    """Writes a corpus_description.json containing all necessary information
    about the corpus.

    Args:
      thinlto_build: Whether or not the build was done with ThinLTO and if so,
        what kind of ThinLTO. Set this to None if the build was not performed
        with ThinLTO.
      relative_output_paths: The relative (to the corpus directory) output paths
        of all the bitcode files that should be placed in the corpus manifest.
        None entries are left out.
      output_dir: The corpus directory where the corpus manifest should be
        placed.
    """
    manifest = {}
    # Inserted first so that global_command_override ends up at the top of the
    # .json after being written.
    if thinlto_build == "local":
        manifest["global_command_override"] = _UNSPECIFIED_OVERRIDE
    manifest["has_thinlto"] = thinlto_build is not None
    manifest["modules"] = [
        module for module in relative_output_paths if module is not None
    ]

    manifest_path = os.path.join(output_dir, "corpus_description.json")
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(manifest, manifest_file, indent=2)
418