Skip to content

Commit

Permalink
Merge pull request #45 from lifting-bits/port_mcsema_lifter
Browse files Browse the repository at this point in the history
New visitor-style machine code lifter
  • Loading branch information
Peter Goodman authored Sep 28, 2020
2 parents 251ce03 + 475a60b commit 87adbd6
Show file tree
Hide file tree
Showing 17 changed files with 1,580 additions and 560 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ add_library(${ANVILL} STATIC

include/anvill/Lift.h
lib/Lift.cpp


include/anvill/MCToIRLifter.h
lib/MCToIRLifter.cpp

include/anvill/Optimize.h
lib/Optimize.cpp

Expand Down Expand Up @@ -120,6 +123,7 @@ target_link_libraries(${SPECIFY_BITCODE} PRIVATE ${ANVILL})
set(ANVILL_PYTHON_SOURCES
setup.py
python/anvill/__init__.py
python/anvill/__main__.py
python/anvill/arch.py
python/anvill/binja.py
python/anvill/dwarf.py
Expand Down
72 changes: 40 additions & 32 deletions JSON.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,14 @@ static void SetVersion(void) {

# include <gflags/gflags.h>
# include <glog/logging.h>

// clang-format off
# include <remill/BC/Compat/CTypes.h>
# include <llvm/IR/LLVMContext.h>
# include <llvm/IR/Module.h>
# include <llvm/Support/JSON.h>
# include <llvm/Support/MemoryBuffer.h>

// clang-format on

# include <remill/Arch/Arch.h>
Expand Down Expand Up @@ -589,69 +591,75 @@ int main(int argc, char *argv[]) {
os_str = maybe_os->str();
}

const auto arch_name = remill::GetArchName(arch_str);
const auto os_name = remill::GetOSName(os_str);

llvm::LLVMContext context;
const auto arch = remill::Arch::Build(&context, os_name, arch_name);
auto arch = remill::Arch::Build(&context, remill::GetOSName(os_str),
remill::GetArchName(arch_str));
if (!arch) {
return EXIT_FAILURE;
}

// NOTE(pag): This needs to come first, unfortunately, as the
// only way for `arch` to learn about the organization
// of the state structure and its named registers is
// by analyzing a module, and this is done in `PrepareModule`,
// which is called by `LoadArchSemantics`.
std::unique_ptr<llvm::Module> semantics(remill::LoadArchSemantics(arch));
remill::IntrinsicTable intrinsics(semantics);
auto semantics = remill::LoadArchSemantics(arch);

anvill::Program program;
if (!ParseSpec(arch.get(), context, program, spec)) {
return EXIT_FAILURE;
}

std::unordered_map<uint64_t, llvm::GlobalVariable *> global_vars;
std::unordered_map<uint64_t, llvm::Function *> lift_targets;
anvill::LiftCodeIntoModule(arch.get(), program, *semantics);

auto trace_manager = anvill::TraceManager::Create(*semantics, program);
RecoverMemoryAccesses(program, *semantics);

remill::InstructionLifter inst_lifter(arch, intrinsics);
remill::TraceLifter trace_lifter(inst_lifter, *trace_manager);
// Create an output module `dest_module`
auto dest_module = std::make_unique<llvm::Module>(FLAGS_spec, context);
dest_module->setTargetTriple(semantics->getTargetTriple());
dest_module->setDataLayout(semantics->getDataLayout());

// Clone functions from `semantics` to `dest_module`.
// Necessary global variables will be cloned too.
program.ForEachFunction([&](const anvill::FunctionDecl *decl) {
auto byte = program.FindByte(decl->address);
if (byte.IsExecutable()) {
trace_lifter.Lift(byte.Address());
}
std::stringstream ss;
ss << "sub_" << std::hex << decl->address << std::dec;
auto src = semantics->getFunction(ss.str());
auto dst = decl->DeclareInModule(ss.str(), *dest_module);
remill::CloneFunctionInto(src, dst);
return true;
});

// Optimize the module, but with a particular focus on only the functions
// that we actually lifted.
anvill::OptimizeModule(arch.get(), program, *semantics);

program.ForEachVariable([&](const anvill::GlobalVarDecl *decl) {
// Apply symbol names to functions if we have the names.
program.ForEachNamedAddress([&](uint64_t addr, const std::string &name,
const anvill::FunctionDecl *fdecl,
const anvill::GlobalVarDecl *vdecl) {
std::stringstream ss;
ss << "data_" << std::hex << decl->address;
global_vars[decl->address] = decl->DeclareInModule(ss.str(), *semantics);
llvm::Value *gval = nullptr;
if (vdecl) {
ss << "data_" << std::hex << vdecl->address << std::dec;
gval = dest_module->getGlobalVariable(ss.str());
} else if (fdecl) {
ss << "sub_" << std::hex << fdecl->address << std::dec;
gval = dest_module->getFunction(ss.str());
} else {
return true;
}

if (gval) {
gval->setName(name);
}

return true;
});

anvill::RecoverMemoryAccesses(program, *semantics);

// anvill::OptimizeModule(arch.get(), program, dest_module);
anvill::OptimizeModule(arch.get(), program, *dest_module);

int ret = EXIT_SUCCESS;

if (!FLAGS_ir_out.empty()) {
if (!remill::StoreModuleIRToFile(semantics.get(), FLAGS_ir_out, true)) {
if (!remill::StoreModuleIRToFile(dest_module.get(), FLAGS_ir_out, true)) {
LOG(ERROR) << "Could not save LLVM IR to " << FLAGS_ir_out;
ret = EXIT_FAILURE;
}
}
if (!FLAGS_bc_out.empty()) {
if (!remill::StoreModuleToFile(semantics.get(), FLAGS_bc_out, true)) {
if (!remill::StoreModuleToFile(dest_module.get(), FLAGS_bc_out, true)) {
LOG(ERROR) << "Could not save LLVM bitcode to " << FLAGS_bc_out;
ret = EXIT_FAILURE;
}
Expand Down
29 changes: 0 additions & 29 deletions examples/lift.py

This file was deleted.

62 changes: 62 additions & 0 deletions include/anvill/LegacyLift.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) 2020 Trail of Bits, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <remill/BC/Lifter.h>

#include <memory>

namespace llvm {
class Module;
} // namespace llvm

namespace remill {
class IntrinsicTable;
} // namespace remill

namespace anvill {

class Program;
struct ValueDecl;

// Manages lifting of machine code functions from the input
// program.
//
// Derives from `remill::TraceManager`. This header only declares the
// destructor and the `Create` factory; neither is defined here.
class TraceManager : public remill::TraceManager {
public:
// Declared virtual so an instance can be destroyed through a pointer to
// a base type.
virtual ~TraceManager(void);

// Factory: builds a trace manager from `semantics_module` and the given
// `Program`. The result is returned as a `std::unique_ptr`, so the caller
// owns it.
// NOTE(review): how the module and program are used, and whether they
// must outlive the returned object, is not visible from this
// declaration -- confirm against the definition.
static std::unique_ptr<TraceManager> Create(llvm::Module &semantics_module,
const Program &);
};

// Produce one or more instructions in `in_block` to load and return
// the lifted value associated with `decl`.
//
// Parameters:
//   decl        Declaration of the value to load.
//   intrinsics  Remill intrinsic table made available to the generated
//               code. NOTE(review): presumably used for memory-read
//               intrinsics -- confirm against the definition.
//   in_block    Basic block that receives the new instructions.
//   state_ptr   NOTE(review): presumably the pointer to the lifted machine
//               state structure -- confirm.
//   mem_ptr     NOTE(review): presumably the current remill memory
//               pointer -- confirm.
//
// Returns the `llvm::Value` holding the loaded value.
llvm::Value *LoadLiftedValue(const ValueDecl &decl,
const remill::IntrinsicTable &intrinsics,
llvm::BasicBlock *in_block, llvm::Value *state_ptr,
llvm::Value *mem_ptr);

// Produce one or more instructions in `in_block` to store the
// native value `native_val` into the lifted state associated
// with `decl`.
//
// Parameters:
//   native_val  Value to be stored.
//   decl        Declaration describing where the value lives in the
//               lifted state.
//   intrinsics  Remill intrinsic table made available to the generated
//               code. NOTE(review): presumably used for memory-write
//               intrinsics -- confirm against the definition.
//   in_block    Basic block that receives the new instructions.
//   state_ptr   NOTE(review): presumably the pointer to the lifted machine
//               state structure -- confirm.
//   mem_ptr     NOTE(review): presumably the current remill memory
//               pointer -- confirm.
//
// NOTE(review): the meaning of the returned `llvm::Value *` is not visible
// from this declaration -- presumably the updated memory pointer; confirm
// against the definition.
llvm::Value *StoreNativeValue(llvm::Value *native_val, const ValueDecl &decl,
const remill::IntrinsicTable &intrinsics,
llvm::BasicBlock *in_block,
llvm::Value *state_ptr, llvm::Value *mem_ptr);

} // namespace anvill
21 changes: 9 additions & 12 deletions include/anvill/Lift.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,25 @@

#pragma once

#include <remill/Arch/Instruction.h>
#include <remill/BC/IntrinsicTable.h>
#include <remill/BC/Lifter.h>

#include <memory>
#include <unordered_map>

namespace llvm {
class Module;
} // namespace llvm

namespace remill {
class IntrinsicTable;
class Arch;
} // namespace remill

namespace anvill {

class Program;
struct ValueDecl;

// Manages lifting of machine code functions from the input
// program.
class TraceManager : public remill::TraceManager {
public:
virtual ~TraceManager(void);

static std::unique_ptr<TraceManager> Create(llvm::Module &semantics_module,
const Program &);
};

// Produce one or more instructions in `in_block` to load and return
// the lifted value associated with `decl`.
llvm::Value *LoadLiftedValue(const ValueDecl &decl,
Expand All @@ -57,4 +51,7 @@ llvm::Value *StoreNativeValue(llvm::Value *native_val, const ValueDecl &decl,
llvm::BasicBlock *in_block,
llvm::Value *state_ptr, llvm::Value *mem_ptr);

// Lift the machine code described by `program` into `module`, using
// `arch` as the target architecture.
// NOTE(review): the meaning of the `bool` result is not visible from this
// declaration -- presumably `true` on success; confirm against the
// definition.
bool LiftCodeIntoModule(const remill::Arch *arch, const Program &program,
llvm::Module &module);

} // namespace anvill
82 changes: 82 additions & 0 deletions include/anvill/MCToIRLifter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2020 Trail of Bits, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <remill/Arch/Instruction.h>
#include <remill/BC/IntrinsicTable.h>
#include <remill/BC/Lifter.h>

#include <unordered_map>

namespace llvm {
class BasicBlock;
class Function;
class Module;
class LLVMContext;
} // namespace llvm

namespace remill {
class Arch;
} // namespace remill

namespace anvill {

class Program;

// Lifts machine code from a `Program` into LLVM IR inside an
// `llvm::Module`, with one `Visit*` method per kind of instruction.
// NOTE(review): summary inferred from the class name and its members; the
// implementation is not visible in this header.
class MCToIRLifter {
private:
// Inputs to lifting. `program`, `module` and `ctx` are reference members,
// so they cannot own what they refer to and those objects must outlive the
// lifter. `arch` is a raw const pointer, presumably borrowed as well.
const remill::Arch *arch;
const Program &program;
llvm::Module &module;
llvm::LLVMContext &ctx;
// NOTE(review): `inst_lifter` is presumably constructed from `arch` and
// `intrinsics`; if so it must stay declared after `intrinsics`, because
// members are initialized in declaration order -- confirm in the
// constructor.
remill::IntrinsicTable intrinsics;
remill::InstructionLifter inst_lifter;

// Result maps
// Each map is keyed by a 64-bit address and holds, respectively, the
// instruction, basic block and function recorded for that address.
std::unordered_map<uint64_t, remill::Instruction> addr_to_inst;
std::unordered_map<uint64_t, llvm::BasicBlock *> addr_to_block;
std::unordered_map<uint64_t, llvm::Function *> addr_to_func;

// Helper
// NOTE(review): presumably returns the cached entry of `addr_to_block` for
// `addr`, creating it on first use -- confirm against the definition.
llvm::BasicBlock *GetOrCreateBlock(const uint64_t addr);

// Visitors used to add terminators to instruction basic blocks
// NOTE(review): the names appear to mirror remill's instruction categories,
// with `VisitInstruction` presumably dispatching to the others -- confirm.
void VisitInvalid(remill::Instruction *inst);
void VisitError(remill::Instruction *inst);
void VisitNormal(remill::Instruction *inst);
void VisitNoOp(remill::Instruction *inst);
void VisitDirectJump(remill::Instruction *inst);
void VisitIndirectJump(remill::Instruction *inst);
void VisitFunctionReturn(remill::Instruction *inst);
void VisitDirectFunctionCall(remill::Instruction *inst);
void VisitIndirectFunctionCall(remill::Instruction *inst);
void VisitConditionalBranch(remill::Instruction *inst);
void VisitInstruction(remill::Instruction *inst);
// Per-address decode / lift steps; each takes or returns the object for a
// single address.
// NOTE(review): whether a null result signals failure is not visible
// here -- confirm.
remill::Instruction *DecodeInstruction(const uint64_t addr);
llvm::BasicBlock *LiftInstruction(remill::Instruction *inst);
llvm::Function *LiftFunction(const uint64_t addr);

public:
// Stores the architecture, the program to lift and the destination module.
// NOTE(review): `ctx` and `intrinsics` have no matching parameter, so they
// are presumably derived from `module` -- confirm in the constructor.
MCToIRLifter(const remill::Arch *arch, const Program &program,
llvm::Module &module);

// Public entry points, both keyed by a function's address.
// NOTE(review): going by the names, `GetOrDeclareFunction` yields a
// declaration only while `GetOrDefineFunction` also lifts the body --
// confirm against the definitions.
llvm::Function *GetOrDeclareFunction(const uint64_t addr);
llvm::Function *GetOrDefineFunction(const uint64_t addr);
};

} // namespace anvill
5 changes: 3 additions & 2 deletions include/anvill/Optimize.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@

namespace llvm {
class Module;
} // namespace llvm
}
namespace remill {
class Arch;
} // namespace remill
}

namespace anvill {

class Program;
Expand Down
1 change: 1 addition & 0 deletions lib/Analyze.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <llvm/IR/Operator.h>
#include <llvm/IR/Type.h>
#include <llvm/Transforms/Utils/Local.h>

// clang-format on

#include <remill/BC/Compat/ScalarTransforms.h>
Expand Down
Loading

0 comments on commit 87adbd6

Please sign in to comment.