From 8f8454ad6072af4a498d0554d6ca165fbffc0923 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 23 Jun 2023 22:41:16 +0000 Subject: [PATCH] Add ability to extract raw bitcode This patch adds in the ability to extract raw bitcode from a directory. This is motivated primarily by extracting LLVM IR bitcode from Rust projects as it is quite easy to emit LLVM bitcode for each target but bitcode embedded in object files is much less trivial and neither solution currently provides command line arguments so they need to be reconstructed regardless. --- compiler_opt/tools/extract_ir.py | 14 +++++-- compiler_opt/tools/extract_ir_lib.py | 60 +++++++++++++++++++++++---- compiler_opt/tools/extract_ir_test.py | 16 +++++++ 3 files changed, 78 insertions(+), 12 deletions(-) diff --git a/compiler_opt/tools/extract_ir.py b/compiler_opt/tools/extract_ir.py index e55913ab..f9c02993 100644 --- a/compiler_opt/tools/extract_ir.py +++ b/compiler_opt/tools/extract_ir.py @@ -44,8 +44,8 @@ 'Input file or directory - either compile_commands.json, a linker parameter' 'list, or a path to a directory containing object files.') flags.DEFINE_enum( - 'input_type', 'json', ['json', 'params', 'directory'], - 'Input file type - json, params, or directory. params latter refers to lld' + 'input_type', 'json', ['json', 'params', 'directory', 'bitcode_directory'], + 'Input type - json, params, or (bitcode) directory. params refers to lld' 'params.') flags.DEFINE_string('output_dir', None, 'Output directory') flags.DEFINE_integer( @@ -69,6 +69,10 @@ '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed ' 'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files ' 'passed in the local case.') +flags.DEFINE_string( + 'default_command_line_flags', '', + 'The default command line flags if no command line data is associated with ' + 'the inputs, i.e., in the bitcode_directory input') flags.DEFINE_string( 'cmd_section_name', '.llvmcmd', 'The section name passed to llvm-objcopy. 
For ELF object files, the ' @@ -118,12 +122,16 @@ def main(argv): 'ml-compiler-opt understands. If your build system provides a' 'structured compilation database, use that instead') objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir) + elif FLAGS.input_type == 'bitcode_directory': + objs = extract_ir_lib.load_bitcode_from_directory(FLAGS.input, + FLAGS.output_dir) else: logging.error('Unknown input type: %s', FLAGS.input_type) relative_output_paths = extract_ir_lib.run_extraction( objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter, - FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name) + FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name, + FLAGS.default_command_line_flags, FLAGS.input_type) extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir) diff --git a/compiler_opt/tools/extract_ir_lib.py b/compiler_opt/tools/extract_ir_lib.py index 9be0ad7c..1ffcd4d0 100644 --- a/compiler_opt/tools/extract_ir_lib.py +++ b/compiler_opt/tools/extract_ir_lib.py @@ -166,6 +166,23 @@ def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str, (not is_thinlto or os.path.exists(self.thinlto_index_file()))) return self.relative_output_path() + def _extract_bitcode_artifacts(self, default_command_line) -> Optional[str]: + """Extracts a raw bitcode file from a directory + + Args: + default_command_line: The command line flags to use when writing + per-bitcode-file command lines as they don't have a command line + associated with them by default. 
+ """ + if not os.path.exists(self.input_obj()): + logging.info('%s does not exist.', self.input_obj()) + os.makedirs(self.dest_dir(), exist_ok=True) + shutil.copy(self.input_obj(), self.bc_file()) + with open(self.cmd_file(), 'w', encoding='utf-8') as command_file: + command_file.write(default_command_line) + assert (os.path.exists(self.cmd_file()) and os.path.exists(self.bc_file())) + return self.relative_output_path() + def _extract_lld_artifacts(self) -> Optional[str]: """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation. """ @@ -190,15 +207,20 @@ def extract(self, cmd_filter: Optional[str] = None, thinlto_build: Optional[str] = None, cmd_section_name: Optional[str] = '.llvmcmd', - bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]: + bitcode_section_name: Optional[str] = '.llvmbc', + default_command_line_flags='', + input_type='') -> Optional[str]: if thinlto_build == 'local': return self._extract_lld_artifacts() - return self._extract_clang_artifacts( - llvm_objcopy_path=llvm_objcopy_path, - cmd_filter=cmd_filter, - is_thinlto=thinlto_build == 'distributed', - cmd_section_name=cmd_section_name, - bitcode_section_name=bitcode_section_name) + if input_type == 'bitcode_directory': + return self._extract_bitcode_artifacts(default_command_line_flags) + else: + return self._extract_clang_artifacts( + llvm_objcopy_path=llvm_objcopy_path, + cmd_filter=cmd_filter, + is_thinlto=thinlto_build == 'distributed', + cmd_section_name=cmd_section_name, + bitcode_section_name=bitcode_section_name) def convert_compile_command_to_objectfile( @@ -276,6 +298,19 @@ def make_spec(obj_file: str): return [make_spec(path) for path in paths] +def load_bitcode_from_directory(bitcode_base_dir: str, + output_dir: str) -> List[TrainingIRExtractor]: + paths = [str(p) for p in pathlib.Path(bitcode_base_dir).glob('**/*.bc')] + + def make_spec(bc_file: str): + return TrainingIRExtractor( + obj_relative_path=os.path.relpath(bc_file, start=bitcode_base_dir), + 
output_base_dir=output_dir, + obj_base_dir=bitcode_base_dir) + + return [make_spec(path) for path in paths] + + def load_for_lld_thinlto(obj_base_dir: str, output_dir: str) -> List[TrainingIRExtractor]: # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport') @@ -296,7 +331,8 @@ def make_spec(obj_file: str): def run_extraction(objs: List[TrainingIRExtractor], num_workers: int, llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str, - cmd_section_name: str, bitcode_section_name: str): + cmd_section_name: str, bitcode_section_name: str, + default_command_line_flags: str, input_type: str): """Extracts all specified object files into the corpus directory. Args: @@ -314,6 +350,10 @@ def run_extraction(objs: List[TrainingIRExtractor], num_workers: int, bitcode embedding. bitcode_section_name: The name of the bitcode section created by the bitcode embedding. + default_command_line_flags: The command line flags to use if no command + line is present from within the build (e.g., extracting from a directory + containing only bitcode). + input_type: The type of input that IR is being extracted from. 
""" extract_artifacts = functools.partial( TrainingIRExtractor.extract, @@ -321,7 +361,9 @@ def run_extraction(objs: List[TrainingIRExtractor], num_workers: int, cmd_filter=cmd_filter, thinlto_build=thinlto_build, cmd_section_name=cmd_section_name, - bitcode_section_name=bitcode_section_name) + bitcode_section_name=bitcode_section_name, + default_command_line_flags=default_command_line_flags, + input_type=input_type) with multiprocessing.Pool(num_workers) as pool: relative_output_paths = pool.map(extract_artifacts, objs) diff --git a/compiler_opt/tools/extract_ir_test.py b/compiler_opt/tools/extract_ir_test.py index b7004948..cc8c1169 100644 --- a/compiler_opt/tools/extract_ir_test.py +++ b/compiler_opt/tools/extract_ir_test.py @@ -135,6 +135,22 @@ def test_load_from_directory(self): self.assertEqual(obj._obj_base_dir, tempdir.full_path) self.assertEqual(obj._output_base_dir, outdir.full_path) + def test_load_bitcode_from_directory(self): + tempdir = self.create_tempdir() + subdir = tempdir.mkdir(dir_path='subdir') + subdir.create_file(file_path='test1.bc') + subdir.create_file(file_path='test2.bc') + outdir = self.create_tempdir() + bc_files = extract_ir_lib.load_bitcode_from_directory( + tempdir.full_path, outdir.full_path) + self.assertLen(bc_files, 2) + for index, bc_file in enumerate( + sorted(bc_files, key=lambda x: x._obj_relative_path)): + self.assertEqual(bc_file._obj_relative_path, + f'subdir/test{index +1:d}.bc') + self.assertEqual(bc_file._obj_base_dir, tempdir.full_path) + self.assertEqual(bc_file._output_base_dir, outdir.full_path) + def test_lld_thinlto_discovery(self): tempdir = self.create_tempdir() tempdir.create_file(file_path='1.3.import.bc')