# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Tool for making a corpus from arbitrary bitcode.

To create a corpus from a set of bitcode files in an input directory, run
the following command:

PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
  --input_dir=<path to input directory> \
  --output_dir=<path to output directory> \
  --default_args="<list of space separated flags>"
"""

import argparse
import logging

from mlgo.corpus import make_corpus_lib


def parse_args_and_run() -> None:
    """Parse the command-line flags and hand the result to `main`.

    Flags:
      --input_dir: directory passed to the corpus library as the source.
      --output_dir: directory passed to the corpus library as the destination.
      --default_args: space separated compiler flags; defaults to "".
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from arbitrary bitcode"
    )
    # Both directories are consumed unconditionally by main(); make argparse
    # reject a missing flag up front instead of passing None downstream.
    parser.add_argument(
        "--input_dir", type=str, required=True, help="The input directory."
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="The output directory."
    )
    parser.add_argument(
        "--default_args",
        type=str,
        help="The compiler flags to compile with when using downstream tooling.",
        default="",
        # With nargs="?", a bare `--default_args` (flag present, no value)
        # stores `const`, which is None unless set. Use "" so that the later
        # `.split()` in main() yields an empty flag list instead of crashing.
        const="",
        nargs="?",
    )
    args = parser.parse_args()
    main(args)


def main(args: argparse.Namespace) -> None:
    """Build a corpus in `args.output_dir` from `args.input_dir`.

    Logs a warning about bitcode provenance, then calls, in order,
    `make_corpus_lib.load_bitcode_from_directory`,
    `make_corpus_lib.copy_bitcode` and
    `make_corpus_lib.write_corpus_manifest`.

    Args:
      args: namespace with `input_dir`, `output_dir` and `default_args`
        attributes. `default_args` is a whitespace separated flag string;
        None is treated as an empty string.
    """
    logging.warning(
        "Using this tool does not guarantee that the bitcode is taken at "
        "the correct stage for consumption during model training. Make "
        "sure to validate assumptions about where the bitcode is coming "
        "from before using it in production."
    )
    # Tolerate None so callers that build the namespace by hand (or older
    # parsers without const="") do not hit AttributeError on .split().
    default_flags = (args.default_args or "").split()

    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
    make_corpus_lib.write_corpus_manifest(
        relative_paths, args.output_dir, default_flags
    )


if __name__ == "__main__":
    parse_args_and_run()