xref: /llvm-project/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py (revision a387bce4bcbaeb28bf4510817ce54602e2f7a21d)
1# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2# See https://llvm.org/LICENSE.txt for license information.
3# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4"""Library functions for making a corpus from arbitrary bitcode."""
5
6import pathlib
7import os
8import shutil
9import json
10
11from typing import List, Optional
12
# Suffix that identifies bitcode files. It is used to build the search pattern
# when scanning a directory, stripped from the paths handed back to callers,
# and appended again when files are copied.
BITCODE_EXTENSION = ".bc"
14
15
def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
    """Finds bitcode files to extract from a given directory.

    The directory is searched recursively for regular files whose name ends
    in BITCODE_EXTENSION. Directories that merely carry that suffix are
    skipped, since they cannot be copied as bitcode modules.

    Args:
      bitcode_base_dir: The base directory where the bitcode to be copied
        is from.

    Returns a sorted array of paths representing the relative path to each
    bitcode file from the base directory, with the bitcode extension removed.
    """
    pattern = "**/*" + BITCODE_EXTENSION
    stems = [
        # Drop the extension; callers append it again when needed.
        str(path)[: -len(BITCODE_EXTENSION)]
        for path in pathlib.Path(bitcode_base_dir).glob(pattern)
        # glob() also yields directories matching the pattern; keep files only.
        if path.is_file()
    ]

    # Sort so the result does not depend on filesystem enumeration order.
    return sorted(os.path.relpath(stem, start=bitcode_base_dir) for stem in stems)
33
34
def copy_bitcode(
    relative_paths: List[str], bitcode_base_dir: str, output_dir: str
) -> None:
    """Mirrors the listed bitcode files from one directory tree into another.

    Args:
      relative_paths: Extension-less paths of the bitcode files, relative to
        the base directory. The same relative layout is recreated under the
        output directory.
      bitcode_base_dir: The directory the bitcode files are read from.
      output_dir: The directory the bitcode files are copied into.
    """
    for module_stem in relative_paths:
        # Source and destination share the same relative file name.
        file_name = module_stem + BITCODE_EXTENSION
        source_file = os.path.join(bitcode_base_dir, file_name)
        target_file = os.path.join(output_dir, file_name)

        # Create any missing parent directories before copying.
        target_parent, _ = os.path.split(target_file)
        os.makedirs(target_parent, exist_ok=True)
        shutil.copy(source_file, target_file)
51
52
def write_corpus_manifest(
    relative_output_paths: List[str],
    output_dir: str,
    default_args: Optional[List[str]] = None,
) -> None:
    """Writes corpus_description.json describing the collected bitcode.

    Args:
      relative_output_paths: A list of paths to each bitcode file relative to
        the output directory. Entries that are None are left out of the
        manifest.
      output_dir: The output directory where the corpus is being created; the
        manifest file is written directly inside it.
      default_args: An array of compiler flags that should be used to compile
        the bitcode when using further downstream tooling. Omitting it (or
        passing None) records an empty list.
    """
    command_override = [] if default_args is None else default_args

    # Only real paths are listed as modules.
    modules = []
    for module_path in relative_output_paths:
        if module_path is not None:
            modules.append(module_path)

    manifest = {
        "global_command_override": command_override,
        "has_thinlto": False,
        "modules": modules,
    }

    manifest_path = os.path.join(output_dir, "corpus_description.json")
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(manifest, manifest_file, indent=2)
78