Skip to content

Commit

Permalink
Add ability to extract raw bitcode
Browse files Browse the repository at this point in the history
This patch adds in the ability to extract raw bitcode from a directory.
This is motivated primarily by extracting LLVM IR bitcode from rust
projects as it is quite easy to emit LLVM bitcode for each target but
bitcode embedded in object files is much less trivial and neither
solution currently provides command line arguments so they need to be
reconstructed regardless.
  • Loading branch information
boomanaiden154 committed Jun 28, 2023
1 parent 22c7154 commit 8f8454a
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 12 deletions.
14 changes: 11 additions & 3 deletions compiler_opt/tools/extract_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
'Input file or directory - either compile_commands.json, a linker parameter'
'list, or a path to a directory containing object files.')
flags.DEFINE_enum(
'input_type', 'json', ['json', 'params', 'directory'],
'Input file type - json, params, or directory. params latter refers to lld'
'input_type', 'json', ['json', 'params', 'directory', 'bitcode_directory'],
'Input type - json, params, or (bitcode) directory. params refers to lld'
'params.')
flags.DEFINE_string('output_dir', None, 'Output directory')
flags.DEFINE_integer(
Expand All @@ -69,6 +69,10 @@
'-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
'passed in the local case.')
flags.DEFINE_string(
'default_command_line_flags', '',
'The default command line flags if no command line data is associated with '
'the inputs, i.e., in the bitcode_directory input')
flags.DEFINE_string(
'cmd_section_name', '.llvmcmd',
'The section name passed to llvm-objcopy. For ELF object files, the '
Expand Down Expand Up @@ -118,12 +122,16 @@ def main(argv):
'ml-compiler-opt understands. If your build system provides a'
'structured compilation database, use that instead')
objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
elif FLAGS.input_type == 'bitcode_directory':
objs = extract_ir_lib.load_bitcode_from_directory(FLAGS.input,
FLAGS.output_dir)
else:
logging.error('Unknown input type: %s', FLAGS.input_type)

relative_output_paths = extract_ir_lib.run_extraction(
objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name,
FLAGS.default_command_line_flags, FLAGS.input_type)

extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
relative_output_paths, FLAGS.output_dir)
Expand Down
60 changes: 51 additions & 9 deletions compiler_opt/tools/extract_ir_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,23 @@ def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
(not is_thinlto or os.path.exists(self.thinlto_index_file())))
return self.relative_output_path()

def _extract_bitcode_artifacts(self,
                               default_command_line: str) -> Optional[str]:
  """Copies a raw bitcode file into the output tree and writes its flags.

  The input file (self.input_obj()) is copied to self.bc_file(), and
  default_command_line is written verbatim to self.cmd_file().

  Args:
    default_command_line: The command line flags to use when writing
      per-bitcode-file command lines as they don't have a command line
      associated with them by default.

  Returns:
    The value of self.relative_output_path() once both output files exist,
    or None if the input bitcode file does not exist.
  """
  if not os.path.exists(self.input_obj()):
    logging.info('%s does not exist.', self.input_obj())
    # Bail out here: falling through would make shutil.copy raise on the
    # missing source file instead of skipping this module.
    return None
  os.makedirs(self.dest_dir(), exist_ok=True)
  shutil.copy(self.input_obj(), self.bc_file())
  with open(self.cmd_file(), 'w', encoding='utf-8') as command_file:
    command_file.write(default_command_line)
  # Sanity check that both artifacts were actually produced.
  assert (os.path.exists(self.cmd_file()) and os.path.exists(self.bc_file()))
  return self.relative_output_path()

def _extract_lld_artifacts(self) -> Optional[str]:
"""Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
"""
Expand All @@ -190,15 +207,20 @@ def extract(self,
cmd_filter: Optional[str] = None,
thinlto_build: Optional[str] = None,
cmd_section_name: Optional[str] = '.llvmcmd',
bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
bitcode_section_name: Optional[str] = '.llvmbc',
default_command_line_flags='',
input_type='') -> Optional[str]:
if thinlto_build == 'local':
return self._extract_lld_artifacts()
return self._extract_clang_artifacts(
llvm_objcopy_path=llvm_objcopy_path,
cmd_filter=cmd_filter,
is_thinlto=thinlto_build == 'distributed',
cmd_section_name=cmd_section_name,
bitcode_section_name=bitcode_section_name)
if input_type == 'bitcode_directory':
return self._extract_bitcode_artifacts(default_command_line_flags)
else:
return self._extract_clang_artifacts(
llvm_objcopy_path=llvm_objcopy_path,
cmd_filter=cmd_filter,
is_thinlto=thinlto_build == 'distributed',
cmd_section_name=cmd_section_name,
bitcode_section_name=bitcode_section_name)


def convert_compile_command_to_objectfile(
Expand Down Expand Up @@ -276,6 +298,19 @@ def make_spec(obj_file: str):
return [make_spec(path) for path in paths]


def load_bitcode_from_directory(bitcode_base_dir: str,
                                output_dir: str) -> List[TrainingIRExtractor]:
  """Creates one extractor for every .bc file found under a directory.

  Args:
    bitcode_base_dir: Directory that is searched recursively ('**/*.bc').
    output_dir: Passed to each extractor as its output_base_dir.

  Returns:
    A list with one TrainingIRExtractor per matching file; each file is
    identified by its path relative to bitcode_base_dir.
  """
  # Materialize the matches up front so globbing finishes before any
  # extractor is constructed.
  matches = [str(found) for found in pathlib.Path(bitcode_base_dir).glob('**/*.bc')]

  extractors = []
  for bitcode_path in matches:
    relative = os.path.relpath(bitcode_path, start=bitcode_base_dir)
    extractors.append(
        TrainingIRExtractor(
            obj_relative_path=relative,
            output_base_dir=output_dir,
            obj_base_dir=bitcode_base_dir))
  return extractors


def load_for_lld_thinlto(obj_base_dir: str,
output_dir: str) -> List[TrainingIRExtractor]:
# .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
Expand All @@ -296,7 +331,8 @@ def make_spec(obj_file: str):

def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
cmd_section_name: str, bitcode_section_name: str):
cmd_section_name: str, bitcode_section_name: str,
default_command_line_flags: str, input_type: str):
"""Extracts all specified object files into the corpus directory.
Args:
Expand All @@ -314,14 +350,20 @@ def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
bitcode embedding.
bitcode_section_name: The name of the bitcode section created by the
bitcode embedding.
default_command_line_flags: The command line flags to use if no command
line is present from within the build (e.g., extracting from a directory
containing only bitcode).
input_type: The type of input that IR is being extracted from.
"""
extract_artifacts = functools.partial(
TrainingIRExtractor.extract,
llvm_objcopy_path=llvm_objcopy_path,
cmd_filter=cmd_filter,
thinlto_build=thinlto_build,
cmd_section_name=cmd_section_name,
bitcode_section_name=bitcode_section_name)
bitcode_section_name=bitcode_section_name,
default_command_line_flags=default_command_line_flags,
input_type=input_type)

with multiprocessing.Pool(num_workers) as pool:
relative_output_paths = pool.map(extract_artifacts, objs)
Expand Down
16 changes: 16 additions & 0 deletions compiler_opt/tools/extract_ir_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,22 @@ def test_load_from_directory(self):
self.assertEqual(obj._obj_base_dir, tempdir.full_path)
self.assertEqual(obj._output_base_dir, outdir.full_path)

def test_load_bitcode_from_directory(self):
  """Checks that .bc files inside a nested subdirectory are discovered."""
  input_root = self.create_tempdir()
  nested = input_root.mkdir(dir_path='subdir')
  nested.create_file(file_path='test1.bc')
  nested.create_file(file_path='test2.bc')
  output_root = self.create_tempdir()

  found = extract_ir_lib.load_bitcode_from_directory(input_root.full_path,
                                                     output_root.full_path)
  self.assertLen(found, 2)

  # Discovery order is not asserted; sort by relative path before comparing.
  ordered = sorted(found, key=lambda extractor: extractor._obj_relative_path)
  for index, extractor in enumerate(ordered):
    self.assertEqual(extractor._obj_relative_path,
                     f'subdir/test{index +1:d}.bc')
    self.assertEqual(extractor._obj_base_dir, input_root.full_path)
    self.assertEqual(extractor._output_base_dir, output_root.full_path)

def test_lld_thinlto_discovery(self):
tempdir = self.create_tempdir()
tempdir.create_file(file_path='1.3.import.bc')
Expand Down

0 comments on commit 8f8454a

Please sign in to comment.