xref: /llvm-project/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py (revision 120e0623773dc9c43f393d43be0641c7d7ad26f2)
1# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2# See https://llvm.org/LICENSE.txt for license information.
3# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4"""Tool for making a corpus from arbitrary bitcode.
5
6To create a corpus from a set of bitcode files in an input directory, run
7the following command:
8
PYTHONPATH=$PYTHONPATH:. python3 ./mlgo/corpus/make_corpus.py \
10  --input_dir=<path to input directory> \
11  --output_dir=<path to output directory> \
12  --default_args="<list of space separated flags>"
13"""
14
15import argparse
16import logging
17
18from mlgo.corpus import make_corpus_lib
19
20
def parse_args_and_run():
    """Parse the command-line flags and pass them to `main`.

    Flags:
      --input_dir: The input directory (required).
      --output_dir: The output directory (required).
      --default_args: Space separated compiler flags; an omitted flag or a
        flag given without a value both yield the empty string.

    argparse prints a usage message and exits the process when a required
    flag is missing or an unknown flag is supplied.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from arbitrary bitcode"
    )
    # main() forwards both directories to make_corpus_lib with no fallback,
    # so a missing value is rejected here with a usage error rather than
    # being passed on as None.
    parser.add_argument(
        "--input_dir", type=str, required=True, help="The input directory."
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="The output directory."
    )
    parser.add_argument(
        "--default_args",
        type=str,
        help="The compiler flags to compile with when using downstream tooling.",
        default="",
        nargs="?",
        # With nargs="?", a bare `--default_args` stores `const`, which is
        # None unless set. main() calls .split() on this value, so keep it a
        # string in every case.
        const="",
    )
    args = parser.parse_args()
    main(args)
36
37
def main(args):
    """Create a corpus in `args.output_dir` from `args.input_dir`.

    Logs a warning about bitcode provenance, then calls three
    make_corpus_lib helpers in order: load_bitcode_from_directory,
    copy_bitcode and write_corpus_manifest.

    Args:
      args: An object exposing `input_dir`, `output_dir` and `default_args`
        attributes (for example the argparse namespace built by
        `parse_args_and_run`). `default_args` is split on whitespace before
        being handed to the manifest writer.
    """
    provenance_caveat = (
        "Using this tool does not guarantee that the bitcode is taken at "
        "the correct stage for consumption during model training. Make "
        "sure to validate assumptions about where the bitcode is coming "
        "from before using it in production."
    )
    logging.warning(provenance_caveat)

    source_dir = args.input_dir
    bitcode_paths = make_corpus_lib.load_bitcode_from_directory(source_dir)

    destination_dir = args.output_dir
    make_corpus_lib.copy_bitcode(bitcode_paths, source_dir, destination_dir)

    # Split only after the copy step, matching the original evaluation order.
    compiler_flags = args.default_args.split()
    make_corpus_lib.write_corpus_manifest(
        bitcode_paths, destination_dir, compiler_flags
    )
50
51
# Script entry point: parse the flags and build the corpus only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    parse_args_and_run()
54