Skip to content

Commit

Permalink
arm: add support for arm architecture and improve elf files
Browse files Browse the repository at this point in the history
  • Loading branch information
gauthier-wiemann committed Oct 4, 2023
1 parent b1175ab commit e059407
Show file tree
Hide file tree
Showing 15 changed files with 1,981 additions and 41 deletions.
5 changes: 5 additions & 0 deletions .github/pyinstaller/hooks/hook-vivisect.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
"vivisect.analysis.amd64",
"vivisect.analysis.amd64.emulation",
"vivisect.analysis.amd64.golang",
"vivisect.analysis.arm",
"vivisect.analysis.arm.emulation",
"vivisect.analysis.arm.renaming",
"vivisect.analysis.arm.thunk_reg",
"vivisect.analysis.crypto",
"vivisect.analysis.crypto.constants",
"vivisect.analysis.elf",
Expand Down Expand Up @@ -76,6 +80,7 @@
"vivisect.analysis.ms.vftables",
"vivisect.analysis.pe",
"vivisect.impapi.posix.amd64",
"vivisect.impapi.posix.arm",
"vivisect.impapi.posix.i386",
"vivisect.impapi.windows",
"vivisect.impapi.windows.advapi_32",
Expand Down
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
### New Features
- ghidra: add Ghidra feature extractor and supporting code #1770 @colton-gabertan
- ghidra: add entry script helping users run capa against a loaded Ghidra database #1767 @mike-hunhoff
- binja: add support for forwarded exports #1646 @xusheng6
- binja: add support for symtab names #1504 @xusheng6
- ARM: add support for ARM architecture
- ELF: improve ELF stripper
- ELF: improve analysis of statically linked ELF files

### Breaking Changes

Expand Down
3 changes: 2 additions & 1 deletion capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ def get_value_str(self):
# other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
ARCH_I386 = "i386"
ARCH_AMD64 = "amd64"
ARCH_ARM = "ARM"
# dotnet
ARCH_ANY = "any"
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ARM, ARCH_ANY)


class Arch(Feature):
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/elf.py
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ def detect_elf_os(f) -> str:
elif symtab_guess:
ret = symtab_guess

return ret.value if ret is not None else "unknown"
return ret.value if ret is not None else "linux"


def detect_elf_arch(f: BinaryIO) -> str:
Expand Down
2 changes: 2 additions & 0 deletions capa/features/extractors/elffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ def extract_file_arch(elf: ELFFile, **kwargs):
yield Arch("i386"), NO_ADDRESS
elif arch == "x64":
yield Arch("amd64"), NO_ADDRESS
elif arch == "ARM":
yield Arch("ARM"), NO_ADDRESS
else:
logger.warning("unsupported architecture: %s", arch)

Expand Down
20 changes: 13 additions & 7 deletions capa/features/extractors/viv/basicblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import string
import struct
from typing import Tuple, Iterator
from typing import Tuple, Union, Iterator

import envi
import envi.archs.arm.disasm
import envi.archs.i386.disasm

from capa.features.common import Feature, Characteristic
Expand Down Expand Up @@ -76,7 +77,7 @@ def extract_stackstring(f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Featu
yield Characteristic("stack string"), bb.address


def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
def is_mov_imm_to_stack(instr: Union[envi.archs.i386.disasm.i386Opcode, envi.archs.arm.disasm.ArmOpcode]) -> bool:
"""
Return if instruction moves immediate onto stack
"""
Expand All @@ -92,22 +93,27 @@ def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
if not src.isImmed():
return False

if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance(
dst, envi.archs.i386.disasm.i386RegMemOper
if (
not isinstance(dst, envi.archs.i386.disasm.i386SibOper)
and not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper)
and not isinstance(dst, envi.archs.arm.disasm.ArmRegOper)
):
return False

if not dst.reg:
return False

rname = dst._dis_regctx.getRegisterName(dst.reg)
if rname not in ["ebp", "rbp", "esp", "rsp"]:
if isinstance(dst, (envi.archs.i386.disasm.i386SibOper, envi.archs.i386.disasm.i386RegMemOper)):
rname = dst._dis_regctx.getRegisterName(dst.reg)
else:
rname = dst.reg
if rname not in ["ebp", "rbp", "esp", "rsp", envi.archs.arm.disasm.REG_SP, envi.archs.arm.disasm.REG_BP]:
return False

return True


def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int:
def get_printable_len(oper: Union[envi.archs.i386.disasm.i386ImmOper, envi.archs.arm.disasm.ArmImmOper]) -> int:
"""
Return string length if all operand bytes are ascii or utf16-le printable
"""
Expand Down
9 changes: 7 additions & 2 deletions capa/features/extractors/viv/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import capa.features.extractors.viv.insn
import capa.features.extractors.viv.global_
import capa.features.extractors.viv.function
import capa.features.extractors.viv.insn_arm
import capa.features.extractors.viv.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
Expand All @@ -26,10 +27,11 @@


class VivisectFeatureExtractor(FeatureExtractor):
def __init__(self, vw, path: Path, os):
def __init__(self, vw, path: Path, os, arm=False):
super().__init__()
self.vw = vw
self.path = path
self.arm = arm
self.buf = path.read_bytes()

# pre-compute these because we'll yield them at *every* scope.
Expand Down Expand Up @@ -74,7 +76,10 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa
def extract_insn_features(
self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)
if self.arm:
yield from capa.features.extractors.viv.insn_arm.extract_features(fh, bbh, ih)
else:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)

def is_library_function(self, addr):
return viv_utils.flirt.is_library_function(self.vw, addr)
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/viv/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def extract_function_loop(fhandle: FunctionHandle) -> Iterator[Tuple[Feature, Ad
bflags & envi.BR_COND
or bflags & envi.BR_FALL
or bflags & envi.BR_TABLE
or bb.instructions[-1].mnem == "jmp"
or bb.instructions[-1].mnem in ["jmp", "b", "bx"]
):
edges.append((bb.va, bva))

Expand Down
5 changes: 4 additions & 1 deletion capa/features/extractors/viv/global_.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import logging
from typing import Tuple, Iterator

from capa.features.common import ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.common import ARCH_ARM, ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.address import NO_ADDRESS, Address

logger = logging.getLogger(__name__)
Expand All @@ -22,6 +22,9 @@ def extract_arch(vw) -> Iterator[Tuple[Feature, Address]]:
elif arch == "i386":
yield Arch(ARCH_I386), NO_ADDRESS

elif arch == "ARM":
yield Arch(ARCH_ARM), NO_ADDRESS

else:
# we likely end up here:
# 1. handling a new architecture (e.g. aarch64)
Expand Down
17 changes: 17 additions & 0 deletions capa/features/extractors/viv/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
from typing import Optional

import envi
from vivisect import VivWorkspace
from vivisect.const import XR_TO, REF_CODE

Expand All @@ -21,3 +22,19 @@ def get_coderef_from(vw: VivWorkspace, va: int) -> Optional[int]:
return xrefs[0][XR_TO]
else:
return None


def read_memory(vw, va: int, size: int) -> bytes:
    """
    Read bytes at `va` from the workspace's memory maps, ignoring map permissions.

    Args:
        vw: workspace object exposing `_map_defs`, an iterable of
            `(start_va, end_va, mmap, bytes)` entries, where `mmap` is a
            4-tuple whose first element is the map's base address.
        va: virtual address to start reading from.
        size: maximum number of bytes to read.

    Returns:
        The bytes starting at `va`. This may be shorter than `size` when the
        requested range runs past the end of the containing map's buffer,
        because the read is a plain slice.

    Raises:
        envi.exc.SegmentationViolation: if `va` is not inside any map.
    """
    # as documented in #176, vivisect will not readMemory() when the section is not marked readable.
    #
    # but here, we don't care about permissions.
    # so, copy the viv implementation of readMemory and remove the permissions check.
    #
    # this is derived from:
    #   https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462
    for map_start, map_end, mmap, mbytes in vw._map_defs:
        if not (map_start <= va < map_end):
            continue

        # the offset is computed against the base recorded in the map tuple itself;
        # the remaining fields (size, perms, name) are intentionally unused.
        map_base, _msize, _mperms, _mfname = mmap
        offset = va - map_base
        return mbytes[offset : offset + size]

    raise envi.exc.SegmentationViolation(va)
58 changes: 53 additions & 5 deletions capa/features/extractors/viv/indirect_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import envi
import vivisect.const
import envi.archs.arm.disasm
import envi.archs.i386.disasm
import envi.archs.amd64.disasm
from vivisect import VivWorkspace
Expand All @@ -20,12 +21,15 @@
i386ImmOper = envi.archs.i386.disasm.i386ImmOper
i386ImmMemOper = envi.archs.i386.disasm.i386ImmMemOper
Amd64RipRelOper = envi.archs.amd64.disasm.Amd64RipRelOper
ARMRegOper = envi.archs.arm.disasm.ArmRegOper
ARMImmOper = envi.archs.arm.disasm.ArmImmOper
ARMScaledOffsetOper = envi.archs.arm.disasm.ArmScaledOffsetOper
LOC_OP = vivisect.const.LOC_OP
IF_NOFALL = envi.IF_NOFALL
REF_CODE = vivisect.const.REF_CODE
FAR_BRANCH_MASK = envi.BR_PROC | envi.BR_DEREF | envi.BR_ARCH

DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor")
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "ldr", "pop", "xor", "eor")


def get_previous_instructions(vw: VivWorkspace, va: int) -> List[int]:
Expand Down Expand Up @@ -71,6 +75,38 @@ class NotFoundError(Exception):
pass


def find_value(vw: VivWorkspace, va: int, reg: int, q):
    """
    scan backwards from the given address to recover a constant loaded into the given register.

    instructions whose first operand is not `reg` are skipped.
    for instructions whose first operand is `reg`:
      - `add`/`sub` with an immediate second operand adjust a running delta and the scan continues,
      - `mov` with an immediate second operand ends the scan and yields `immediate + delta`,
      - anything else is not followed any further along that path.

    args:
      vw: the vivisect workspace used to decode instructions.
      va: the address to start scanning backwards from (exclusive).
      reg: the register number to track.
      q: work queue of addresses to visit; must support `extend()` and `popleft()`.
         NOTE: this queue is mutated in place, and any addresses already in it are visited too.

    returns:
      the recovered integer value, or None if the queue is exhausted without finding a `mov reg, imm`.
    """
    # NOTE(review): a single delta is shared by every explored predecessor path,
    # so adjustments found on different paths are summed together - confirm this is intended.
    delta = 0
    seen = set()  # type: Set[int]

    q.extend(get_previous_instructions(vw, va))
    while q:
        cur = q.popleft()
        if cur in seen:
            continue
        seen.add(cur)

        insn = vw.parseOpcode(cur)
        opers = insn.opers

        touches_reg = bool(opers) and isinstance(opers[0], ARMRegOper) and opers[0].reg == reg
        if not touches_reg:
            # unrelated instruction: keep walking backwards.
            q.extend(get_previous_instructions(vw, cur))
            continue

        mnem = insn.mnem
        if mnem not in ("sub", "add", "mov") or not isinstance(opers[1], ARMImmOper):
            # the register is written in a way we cannot evaluate; abandon this path.
            continue

        imm = opers[1].val
        if mnem == "mov":
            return imm + delta

        delta += imm if mnem == "add" else -imm
        q.extend(get_previous_instructions(vw, cur))

    return None


def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[int]]:
"""
scan backwards from the given address looking for assignments to the given register.
Expand Down Expand Up @@ -106,7 +142,9 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
continue

opnd0 = insn.opers[0]
if not (isinstance(opnd0, i386RegOper) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS):
if not (
isinstance(opnd0, (i386RegOper, ARMRegOper)) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS
):
q.extend(get_previous_instructions(vw, cur))
continue

Expand All @@ -115,16 +153,24 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
# we currently only support extracting the constant from something like: `mov $reg, IAT`
# so, any other pattern results in an unknown value, represented by None.
# this is a good place to extend in the future, if we need more robust support.
if insn.mnem != "mov":
if insn.mnem not in ("mov", "ldr"):
return (cur, None)
else:
opnd1 = insn.opers[1]
if isinstance(opnd1, i386ImmOper):
if isinstance(opnd1, (i386ImmOper, ARMImmOper)):
return (cur, opnd1.getOperValue(opnd1))
elif isinstance(opnd1, i386ImmMemOper):
return (cur, opnd1.getOperAddr(opnd1))
elif isinstance(opnd1, Amd64RipRelOper):
return (cur, opnd1.getOperAddr(insn))
elif isinstance(opnd1, ARMScaledOffsetOper):
base_reg = find_value(vw, cur, opnd1.base_reg, q)
if base_reg is None:
return (cur, None)
offset_reg = find_value(vw, cur, opnd1.offset_reg, q)
if offset_reg is None:
return (cur, None)
return (cur, base_reg + offset_reg)
else:
# might be something like: `mov $reg, dword_401000[eax]`
return (cur, None)
Expand All @@ -136,7 +182,9 @@ def is_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> bool:
if insn is None:
insn = vw.parseOpcode(va)

return insn.mnem in ("call", "jmp") and isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper)
return insn.mnem in ("call", "jmp", "bl", "blx", "b", "bx") and isinstance(
insn.opers[0], (envi.archs.i386.disasm.i386RegOper, envi.archs.arm.disasm.ArmRegOper)
)


def resolve_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> Tuple[int, Optional[int]]:
Expand Down
27 changes: 11 additions & 16 deletions capa/features/extractors/viv/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.elf import SymTab
from capa.features.extractors.viv.helpers import read_memory
from capa.features.extractors.viv.syscall import get_library_function_name
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call

Expand Down Expand Up @@ -81,6 +83,15 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato
if f.vw.getFunctionMeta(f.va, "Thunk"):
return

# handle basic blocks that invoke a system call directly (`int 0x80` or `syscall`).
if insn.mnem in ("int", "syscall"):
if insn.mnem != "int" or insn.opers[0].imm == 128:
name = get_library_function_name(f.vw, bb)
if name is None:
return
yield API(name), ih.address
return

# traditional call via IAT
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper):
oper = insn.opers[0]
Expand Down Expand Up @@ -222,22 +233,6 @@ def derefs(vw, p):
p = next


def read_memory(vw, va: int, size: int) -> bytes:
    """
    Read bytes at `va` from the workspace's memory maps, ignoring map permissions.

    Args:
        vw: workspace object exposing `_map_defs`, an iterable of
            `(start_va, end_va, mmap, bytes)` entries, where `mmap` is a
            4-tuple whose first element is the map's base address.
        va: virtual address to start reading from.
        size: maximum number of bytes to read.

    Returns:
        The bytes starting at `va`. This may be shorter than `size` when the
        requested range runs past the end of the containing map's buffer,
        because the read is a plain slice.

    Raises:
        envi.exc.SegmentationViolation: if `va` is not inside any map.
    """
    # as documented in #176, vivisect will not readMemory() when the section is not marked readable.
    #
    # but here, we don't care about permissions.
    # so, copy the viv implementation of readMemory and remove the permissions check.
    #
    # this is derived from:
    #   https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462
    for map_start, map_end, mmap, mbytes in vw._map_defs:
        if not (map_start <= va < map_end):
            continue

        # the offset is computed against the base recorded in the map tuple itself;
        # the remaining fields (size, perms, name) are intentionally unused.
        map_base, _msize, _mperms, _mfname = mmap
        offset = va - map_base
        return mbytes[offset : offset + size]

    raise envi.exc.SegmentationViolation(va)


def read_bytes(vw, va: int) -> bytes:
"""
read up to MAX_BYTES_FEATURE_SIZE from the given address.
Expand Down
Loading

0 comments on commit e059407

Please sign in to comment.