Skip to content

Commit

Permalink
Merge pull request #1770 from mandiant/backend-ghidra
Browse files Browse the repository at this point in the history
ghidra: add Ghidra feature extractor and supporting code
  • Loading branch information
mike-hunhoff authored Aug 30, 2023
2 parents f339bbf + d17db61 commit 7b08f2d
Show file tree
Hide file tree
Showing 26 changed files with 2,121 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .github/mypy/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,6 @@ ignore_missing_imports = True

[mypy-netnode.*]
ignore_missing_imports = True

[mypy-ghidra.*]
ignore_missing_imports = True
61 changes: 61 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,64 @@ jobs:
env:
BN_LICENSE: ${{ secrets.BN_LICENSE }}
run: pytest -v tests/test_binja_features.py # explicitly refer to the binja tests for performance. other tests run above.

ghidra-tests:
name: Ghidra tests for ${{ matrix.python-version }}
runs-on: ubuntu-20.04
needs: [code_style, rule_linter]
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.11"]
java-version: ["17"]
gradle-version: ["7.3"]
ghidra-version: ["10.3"]
public-version: ["PUBLIC_20230510"] # for ghidra releases
jep-version: ["4.1.1"]
ghidrathon-version: ["3.0.0"]
steps:
- name: Checkout capa with submodules
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
with:
submodules: true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
with:
python-version: ${{ matrix.python-version }}
- name: Set up Java ${{ matrix.java-version }}
uses: actions/setup-java@5ffc13f4174014e2d4d4572b3d74c3fa61aeb2c2 # v3
with:
distribution: 'temurin'
java-version: ${{ matrix.java-version }}
- name: Set up Gradle ${{ matrix.gradle-version }}
uses: gradle/gradle-build-action@40b6781dcdec2762ad36556682ac74e31030cfe2 # v2.5.1
with:
gradle-version: ${{ matrix.gradle-version }}
- name: Install Jep ${{ matrix.jep-version }}
run : pip install jep==${{ matrix.jep-version }}
- name: Install Ghidra ${{ matrix.ghidra-version }}
run: |
mkdir ./.github/ghidra
mkdir ./.github/ghidra/project
wget "https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_${{ matrix.ghidra-version }}_build/ghidra_${{ matrix.ghidra-version }}_${{ matrix.public-version }}.zip" -O ./.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip
unzip .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip -d .github/ghidra/
- name: Install Ghidrathon
run : |
mkdir ./.github/ghidrathon
mkdir -p ~/.ghidra/.ghidra_${{ matrix.ghidra-version }}_PUBLIC/Extensions
curl -o ./.github/ghidrathon/ghidrathon-${{ matrix.ghidrathon-version }}.zip "https://codeload.github.com/mandiant/Ghidrathon/zip/refs/tags/v${{ matrix.ghidrathon-version }}"
unzip .github/ghidrathon/ghidrathon-${{ matrix.ghidrathon-version }}.zip -d .github/ghidrathon/
workdir=$(pwd)
gradle -p ./.github/ghidrathon/Ghidrathon-${{ matrix.ghidrathon-version }}/ -PGHIDRA_INSTALL_DIR=$workdir/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
unzip .github/ghidrathon/Ghidrathon-${{ matrix.ghidrathon-version }}/dist/*.zip -d ~/.ghidra/.ghidra_${{ matrix.ghidra-version }}_PUBLIC/Extensions
- name: Install pyyaml
run: sudo apt-get install -y libyaml-dev
- name: Install capa
run: pip install -e .[dev]
- name: Run tests
run: |
.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC/support/analyzeHeadless .github/ghidra/project ghidra_test -Import ./tests/data/mimikatz.exe_ -ScriptPath ./tests/ -PostScript test_ghidra_features.py > ../output.log
cat ../output.log
exit_code=$(cat ../output.log | grep exit | awk '{print $NF}')
exit $exit_code
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
## master (unreleased)

### New Features
- ghidra: add Ghidra feature extractor and supporting code #1770 @colton-gabertan
- ghidra: add entry script helping users run capa against a loaded Ghidra database #1767 @mike-hunhoff

### Breaking Changes

Expand All @@ -11,6 +13,7 @@
-

### Bug Fixes
- ghidra: fix ints_to_bytes performance #1761 @mike-hunhoff

### capa explorer IDA Pro plugin

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ capa explorer helps you identify interesting areas of a program and build new ca
![capa + IDA Pro integration](https://github.com/mandiant/capa/blob/master/doc/img/explorer_expanded.png)
If you use Ghidra, you can use the Python 3 [Ghidra feature extractor](/capa/ghidra/). This integration enables capa to extract features directly from your Ghidra database, which can help you identify capabilities in programs that you analyze using Ghidra.
# further information
## capa
- [Installation](https://github.com/mandiant/capa/blob/master/doc/installation.md)
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/elf.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,7 +898,7 @@ def guess_os_from_symtab(elf: ELF) -> Optional[OS]:

def detect_elf_os(f) -> str:
"""
f: type Union[BinaryIO, IDAIO]
f: type Union[BinaryIO, IDAIO, GHIDRAIO]
"""
try:
elf = ELF(f)
Expand Down
Empty file.
152 changes: 152 additions & 0 deletions capa/features/extractors/ghidra/basicblock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

import string
import struct
from typing import Tuple, Iterator

import ghidra
from ghidra.program.model.lang import OperandType

import capa.features.extractors.ghidra.helpers
from capa.features.common import Feature, Characteristic
from capa.features.address import Address
from capa.features.basicblock import BasicBlock
from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
from capa.features.extractors.base_extractor import BBHandle, FunctionHandle


def get_printable_len(op: ghidra.program.model.scalar.Scalar) -> int:
"""Return string length if all operand bytes are ascii or utf16-le printable"""
op_bit_len = op.bitLength()
op_byte_len = op_bit_len // 8
op_val = op.getValue()

if op_bit_len == 8:
chars = struct.pack("<B", op_val & 0xFF)
elif op_bit_len == 16:
chars = struct.pack("<H", op_val & 0xFFFF)
elif op_bit_len == 32:
chars = struct.pack("<I", op_val & 0xFFFFFFFF)
elif op_bit_len == 64:
chars = struct.pack("<Q", op_val & 0xFFFFFFFFFFFFFFFF)
else:
raise ValueError(f"Unhandled operand data type 0x{op_bit_len:x}.")

def is_printable_ascii(chars_: bytes):
return all(c < 127 and chr(c) in string.printable for c in chars_)

def is_printable_utf16le(chars_: bytes):
if all(c == 0x00 for c in chars_[1::2]):
return is_printable_ascii(chars_[::2])

if is_printable_ascii(chars):
return op_byte_len

if is_printable_utf16le(chars):
return op_byte_len

return 0


def is_mov_imm_to_stack(insn: ghidra.program.database.code.InstructionDB) -> bool:
"""verify instruction moves immediate onto stack"""

# Ghidra will Bitwise OR the OperandTypes to assign multiple
# i.e., the first operand is a stackvar (dynamically allocated),
# and the second is a scalar value (single int/char/float/etc.)
mov_its_ops = [(OperandType.ADDRESS | OperandType.DYNAMIC), OperandType.SCALAR]
found = False

# MOV dword ptr [EBP + local_*], 0x65
if insn.getMnemonicString().startswith("MOV"):
found = all(insn.getOperandType(i) == mov_its_ops[i] for i in range(2))

return found


def bb_contains_stackstring(bb: ghidra.program.model.block.CodeBlock) -> bool:
"""check basic block for stackstring indicators
true if basic block contains enough moves of constant bytes to the stack
"""
count = 0
for insn in currentProgram().getListing().getInstructions(bb, True): # type: ignore [name-defined] # noqa: F821
if is_mov_imm_to_stack(insn):
count += get_printable_len(insn.getScalar(1))
if count > MIN_STACKSTRING_LEN:
return True
return False


def _bb_has_tight_loop(bb: ghidra.program.model.block.CodeBlock):
"""
parse tight loops, true if last instruction in basic block branches to bb start
"""
# Reverse Ordered, first InstructionDB
last_insn = currentProgram().getListing().getInstructions(bb, False).next() # type: ignore [name-defined] # noqa: F821

if last_insn.getFlowType().isJump():
return last_insn.getAddress(0) == bb.getMinAddress()

return False


def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract stackstring indicators from basic block"""
bb: ghidra.program.model.block.CodeBlock = bbh.inner

if bb_contains_stackstring(bb):
yield Characteristic("stack string"), bbh.address


def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""check basic block for tight loop indicators"""
bb: ghidra.program.model.block.CodeBlock = bbh.inner

if _bb_has_tight_loop(bb):
yield Characteristic("tight loop"), bbh.address


BASIC_BLOCK_HANDLERS = (
extract_bb_tight_loop,
extract_bb_stackstring,
)


def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""
extract features from the given basic block.
args:
bb: the basic block to process.
yields:
Tuple[Feature, int]: the features and their location found in this basic block.
"""
yield BasicBlock(), bbh.address
for bb_handler in BASIC_BLOCK_HANDLERS:
for feature, addr in bb_handler(fh, bbh):
yield feature, addr


def main():
features = []
from capa.features.extractors.ghidra.extractor import GhidraFeatureExtractor

for fh in GhidraFeatureExtractor().get_functions():
for bbh in capa.features.extractors.ghidra.helpers.get_function_blocks(fh):
features.extend(list(extract_features(fh, bbh)))

import pprint

pprint.pprint(features) # noqa: T203


if __name__ == "__main__":
main()
75 changes: 75 additions & 0 deletions capa/features/extractors/ghidra/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Tuple, Iterator

import capa.features.extractors.ghidra.file
import capa.features.extractors.ghidra.insn
import capa.features.extractors.ghidra.global_
import capa.features.extractors.ghidra.function
import capa.features.extractors.ghidra.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor


class GhidraFeatureExtractor(FeatureExtractor):
def __init__(self):
super().__init__()
import capa.features.extractors.ghidra.helpers as ghidra_helpers

self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.ghidra.file.extract_file_format())
self.global_features.extend(capa.features.extractors.ghidra.global_.extract_os())
self.global_features.extend(capa.features.extractors.ghidra.global_.extract_arch())
self.imports = ghidra_helpers.get_file_imports()
self.externs = ghidra_helpers.get_file_externs()
self.fakes = ghidra_helpers.map_fake_import_addrs()

def get_base_address(self):
return AbsoluteVirtualAddress(currentProgram().getImageBase().getOffset()) # type: ignore [name-defined] # noqa: F821

def extract_global_features(self):
yield from self.global_features

def extract_file_features(self):
yield from capa.features.extractors.ghidra.file.extract_features()

def get_functions(self) -> Iterator[FunctionHandle]:
import capa.features.extractors.ghidra.helpers as ghidra_helpers

for fhandle in ghidra_helpers.get_function_symbols():
fh: FunctionHandle = FunctionHandle(
address=AbsoluteVirtualAddress(fhandle.getEntryPoint().getOffset()),
inner=fhandle,
ctx={"imports_cache": self.imports, "externs_cache": self.externs, "fakes_cache": self.fakes},
)
yield fh

@staticmethod
def get_function(addr: int) -> FunctionHandle:
func = getFunctionContaining(toAddr(addr)) # type: ignore [name-defined] # noqa: F821
return FunctionHandle(address=AbsoluteVirtualAddress(func.getEntryPoint().getOffset()), inner=func)

def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.ghidra.function.extract_features(fh)

def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
import capa.features.extractors.ghidra.helpers as ghidra_helpers

yield from ghidra_helpers.get_function_blocks(fh)

def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.ghidra.basicblock.extract_features(fh, bbh)

def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
import capa.features.extractors.ghidra.helpers as ghidra_helpers

yield from ghidra_helpers.get_insn_in_range(bbh)

def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle):
yield from capa.features.extractors.ghidra.insn.extract_features(fh, bbh, ih)
Loading

0 comments on commit 7b08f2d

Please sign in to comment.