xref: /llvm-project/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py (revision 99ea357f7b5e7e01e42b8d68dd211dc304b3115b)
1# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2# See https://llvm.org/LICENSE.txt for license information.
3# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4"""Extract IR for training.
5
6Extract IR for training, either from a compile_commands.json file produced by
7cmake, or a linker parameter list file.
8
9Only run with
10'python compiler_opt/tools/extract_ir.py ...'
11
12The compilation is assumed to have been performed with clang, using
13-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
14
15In a distributed ThinLTO case, the compilation is assumed to have been performed
16specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
17
In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files.
20
21To change the logging verbosity, set the --verbosity flag to the desired level.
22Setting it to a specific level will enable all messages at that level and
23higher. Exact values can be found by invoking the script with --help.
24"""
25
26import argparse
27import json
28import logging
29
30from mlgo.corpus import extract_ir_lib
31
32
def parse_args_and_run(argv=None):
    """Parse the command-line flags and hand the result to `main`.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the process command line, which is the
            behavior callers that pass no arguments already rely on.

    Note:
        Most flags are declared with nargs="?", so a flag given without a
        value parses to None rather than being rejected by argparse.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. `None` for maximum "
        "available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    # parse_args(None) falls back to sys.argv[1:], preserving the old behavior.
    args = parser.parse_args(argv)
    main(args)
125
126
def main(args):
    """Build the list of objects for the requested input and extract a corpus.

    Configures logging, loads the object list through the `extract_ir_lib`
    loader matching `args.input_type` (or the local ThinLTO loader when no
    input is given), runs the extraction, writes the corpus manifest and logs
    how many objects were converted.

    Args:
        args: Parsed flag namespace, as built by `parse_args_and_run`. The
            attributes read are verbosity, input, input_type, thinlto_build,
            obj_base_dir, output_dir, num_workers, llvm_objcopy_path,
            cmd_filter, cmd_section_name and bitcode_section_name.

    Raises:
        ValueError: If --input is combined with --thinlto_build=local, if
            neither of them is provided, or if the input type is not one of
            the supported values.
    """
    logging.basicConfig(level=args.verbosity)

    if args.input is not None and args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input is None:
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
    elif args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )
    elif args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )
    elif args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
    elif args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            objs = extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )
    else:
        # Reachable when main() is called directly, or when --input_type is
        # given without a value (nargs="?" then yields None). Fail loudly
        # instead of silently producing an empty corpus.
        raise ValueError(f"Unknown input type: {args.input_type}")

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # None entries in the extraction result are counted as not converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )
188
189
190if __name__ == "__main__":
191    parse_args_and_run()
192