Skip to content

Commit

Permalink
Fix VMRay missing process data (#2396)
Browse files Browse the repository at this point in the history
* get all processes, see #2394

* add tests for process recording

* rename symbols for clarification

* handle single and list entries

* update changelog

* dynamic: vmray: use monitor IDs to track processes and threads

* dynamic: vmray: code refactor

* dynamic: vmray: add sanity checks when processing monitor processes

* dynamic: vmray: remove unnecessary keys() access

* dynamic: vmray: clarify comments

* Update CHANGELOG.md

Co-authored-by: Willi Ballenthin <[email protected]>

* dynamic: vmray: update CHANGELOG

---------

Co-authored-by: Mike Hunhoff <[email protected]>
Co-authored-by: Willi Ballenthin <[email protected]>
  • Loading branch information
3 people authored Sep 26, 2024
1 parent c48bccf commit 06271a8
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 79 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

- use Python 3.12 to build extra standalone build on Linux #2383 @williballenthin
- bump minimum Python version to 3.8.1 to satisfy uv #2387 @williballenthin
- vmray: collect more process information from flog.xml #2394 @mr-tz @mike-hunhoff

### capa explorer IDA Pro plugin

Expand Down
95 changes: 66 additions & 29 deletions capa/features/extractors/vmray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path
from zipfile import ZipFile
from collections import defaultdict
from dataclasses import dataclass

from capa.exceptions import UnsupportedFormatError
from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict
Expand All @@ -21,6 +22,21 @@
SUPPORTED_FLOG_VERSIONS = ("2",)


@dataclass
class VMRayMonitorThread:
    """Thread record that pairs the OS thread ID with the IDs VMRay assigns."""

    tid: int  # thread ID assigned by OS
    monitor_id: int  # unique ID assigned to thread by VMRay
    process_monitor_id: int  # unique ID assigned to containing process by VMRay


@dataclass
class VMRayMonitorProcess:
    """Process record that pairs OS process IDs with the ID VMRay assigns."""

    pid: int  # process ID assigned by OS
    ppid: int  # parent process ID assigned by OS
    monitor_id: int  # unique ID assigned to process by VMRay
    image_name: str


class VMRayAnalysis:
def __init__(self, zipfile_path: Path):
self.zipfile = ZipFile(zipfile_path, "r")
Expand All @@ -45,9 +61,15 @@ def __init__(self, zipfile_path: Path):
self.exports: Dict[int, str] = {}
self.imports: Dict[int, Tuple[str, str]] = {}
self.sections: Dict[int, str] = {}
self.process_ids: Dict[int, int] = {}
self.process_threads: Dict[int, List[int]] = defaultdict(list)
self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
self.monitor_processes: Dict[int, VMRayMonitorProcess] = {}
self.monitor_threads: Dict[int, VMRayMonitorThread] = {}

# map monitor thread IDs to their associated monitor process ID
self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list)

# map function calls to their associated monitor thread ID mapped to its associated monitor process ID
self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))

self.base_address: int

self.sample_file_name: Optional[str] = None
Expand Down Expand Up @@ -79,13 +101,14 @@ def __init__(self, zipfile_path: Path):

self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)

# do not change order, it matters
self._compute_base_address()
self._compute_imports()
self._compute_exports()
self._compute_sections()
self._compute_process_ids()
self._compute_process_threads()
self._compute_process_calls()
self._compute_monitor_processes()
self._compute_monitor_threads()
self._compute_monitor_process_calls()

def _find_sample_file(self):
for file_name, file_analysis in self.sv2.files.items():
Expand Down Expand Up @@ -128,34 +151,48 @@ def _compute_sections(self):
for elffile_section in self.sample_file_static_data.elf.sections:
self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name

def _compute_process_ids(self):
def _compute_monitor_processes(self):
    """Populate self.monitor_processes, keyed by VMRay monitor process ID.

    Processes are read from SummaryV2.json first and then from flog.xml.

    Raises:
        AssertionError: if a monitor ID appears twice in SummaryV2.json, or if
            a process recorded in both sources does not compare equal.
    """
    for process in self.sv2.processes.values():
        # we expect monitor IDs to be unique
        assert process.monitor_id not in self.monitor_processes

        # resolve the parent's OS PID through the parent reference; 0 when there is no parent reference
        ppid: int = (
            self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
        )
        self.monitor_processes[process.monitor_id] = VMRayMonitorProcess(
            process.os_pid, ppid, process.monitor_id, process.image_name
        )

    # not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394
    for monitor_process in self.flog.analysis.monitor_processes:
        vmray_monitor_process: VMRayMonitorProcess = VMRayMonitorProcess(
            monitor_process.os_pid,
            monitor_process.os_parent_pid,
            monitor_process.process_id,
            monitor_process.image_name,
        )

        if monitor_process.process_id not in self.monitor_processes:
            self.monitor_processes[monitor_process.process_id] = vmray_monitor_process
        else:
            # we expect monitor processes recorded in both SummaryV2.json and flog.xml to equal
            assert self.monitor_processes[monitor_process.process_id] == vmray_monitor_process

def _compute_process_calls(self):
for function_call in self.flog.analysis.function_calls:
pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID
tid: int = function_call.thread_id
def _compute_monitor_threads(self):
    """Populate the monitor thread tables from the flog.xml monitor threads.

    Fills self.monitor_threads (monitor thread ID -> VMRayMonitorThread) and
    self.monitor_threads_by_monitor_process (monitor process ID -> list of
    monitor thread IDs).

    Raises:
        AssertionError: if a monitor thread ID is seen more than once.
    """
    for monitor_thread in self.flog.analysis.monitor_threads:
        # we expect monitor IDs to be unique
        assert monitor_thread.thread_id not in self.monitor_threads

        self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread(
            monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id
        )

        # we expect each monitor thread ID to be unique for its associated monitor process ID e.g. monitor
        # thread ID 10 should not be captured twice for monitor process ID 1
        #
        # the lookup must be keyed by the monitor *process* ID: the map is a defaultdict, so indexing it by
        # the thread ID would always see an empty list and would also insert a bogus entry for that key
        assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.process_id]

        self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id)

def get_process_os_pid(self, monitor_id: int) -> int:
return self.process_ids[monitor_id]
def _compute_monitor_process_calls(self):
for function_call in self.flog.analysis.function_calls:
self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call)
31 changes: 21 additions & 10 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,16 @@
import capa.features.extractors.vmray.file
import capa.features.extractors.vmray.global_
from capa.features.common import Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall
from capa.features.address import (
NO_ADDRESS,
Address,
ThreadAddress,
ProcessAddress,
DynamicCallAddress,
AbsoluteVirtualAddress,
)
from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall
from capa.features.extractors.base_extractor import (
CallHandle,
SampleHashes,
Expand Down Expand Up @@ -69,20 +76,24 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
yield from self.global_features

def get_processes(self) -> Iterator[ProcessHandle]:
    """Yield a handle for every monitor process collected by the analysis.

    The handle's address carries the OS PID and parent PID; the handle's
    `inner` is the VMRayMonitorProcess itself, which carries the monitor ID.
    """
    for monitor_process in self.analysis.monitor_processes.values():
        address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
        yield ProcessHandle(address, inner=monitor_process)

def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
    """Yield process-scope features; currently always yields nothing."""
    # we have not identified process-specific features for VMRay yet
    yield from []

def get_process_name(self, ph) -> str:
    """Return the image name of the monitor process stored in the handle's `inner`."""
    monitor_process: VMRayMonitorProcess = ph.inner
    return monitor_process.image_name

def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
    """Yield a handle for each monitor thread recorded for the given process.

    The thread address carries the OS thread ID; the handle's `inner` is the
    VMRayMonitorThread, which carries the monitor thread ID.
    """
    # threads are indexed by the VMRay monitor process ID held in the process handle's inner record
    for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
        monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]

        address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
        yield ThreadHandle(address=address, inner=monitor_thread)

def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
if False:
Expand All @@ -92,7 +103,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat
return

def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
    """Yield a handle for each function call recorded for the given process and thread."""
    # calls are indexed by monitor process ID and monitor thread ID, both taken from the handles' inner records
    for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]:
        addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id)
        yield CallHandle(address=addr, inner=function_call)

Expand Down
23 changes: 2 additions & 21 deletions capa/features/extractors/vmray/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,18 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import Dict, Tuple, Iterator
from typing import Tuple, Iterator

import capa.features.extractors.common
from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.vmray.models import Process
from capa.features.extractors.base_extractor import ProcessHandle

logger = logging.getLogger(__name__)


def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
processes: Dict[str, Process] = analysis.sv2.processes

for process in processes.values():
# we map VMRay's monitor ID to the OS PID to make it easier for users
# to follow the processes in capa's output
pid: int = analysis.get_process_os_pid(process.monitor_id)
ppid: int = (
analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)
if process.ref_parent_process
else 0
)

addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid)
yield ProcessHandle(address=addr, inner=process)


def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
    """Yield an Export feature, located at its absolute virtual address, for each entry in analysis.exports."""
    for addr, name in analysis.exports.items():
        yield Export(name), AbsoluteVirtualAddress(addr)
Expand Down
40 changes: 37 additions & 3 deletions capa/features/extractors/vmray/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class Param(BaseModel):
deref: Optional[ParamDeref] = None


def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
def validate_ensure_is_list(value: Union[List[Param], Param]) -> List[Param]:
if isinstance(value, list):
return value
else:
Expand All @@ -97,7 +97,7 @@ def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
# params may be stored as a list of Param or a single Param so we convert
# the input value to Python list type before the inner validation (List[Param])
# is called
ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
ParamList = Annotated[List[Param], BeforeValidator(validate_ensure_is_list)]


class Params(BaseModel):
Expand Down Expand Up @@ -137,12 +137,46 @@ class FunctionReturn(BaseModel):
from_addr: HexInt = Field(alias="from")


class MonitorProcess(BaseModel):
    """A `monitor_process` entry of the flog.xml analysis log.

    Fields that are commented out are not parsed.
    """

    ts: HexInt
    process_id: int  # used by the extractor as the VMRay monitor process ID
    image_name: str
    filename: str
    # page_root: HexInt
    os_pid: HexInt  # process ID assigned by OS
    # os_integrity_level: HexInt
    # os_privileges: HexInt
    monitor_reason: str
    parent_id: int
    os_parent_pid: HexInt  # parent process ID assigned by OS
    # cmd_line: str
    # cur_dir: str
    # os_username: str
    # bitness: int
    # os_groups: str


class MonitorThread(BaseModel):
    """A `monitor_thread` entry of the flog.xml analysis log."""

    ts: HexInt
    thread_id: int  # used by the extractor as the VMRay monitor thread ID
    process_id: int  # monitor process ID of the containing process
    os_tid: HexInt  # thread ID assigned by OS


# handle the case where there is only a single entry but the model expects a list
MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)]
MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)]
FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)]


class Analysis(BaseModel):
    """The analysis section of flog.xml: log metadata plus monitored processes, threads, and function calls.

    Each collection defaults to an empty list when its element is absent, and
    its list type coerces a single entry into a one-element list before
    validation.
    """

    log_version: str  # tested 2
    analyzer_version: str  # tested 2024.2.1
    # analysis_date: str

    monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[])
    monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[])
    function_calls: FunctionCallList = Field(alias="fncall", default=[])
    # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])


Expand Down
8 changes: 8 additions & 0 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,14 @@ def get_data_path_by_name(name) -> Path:
/ "vmray"
/ "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip"
)
elif name.startswith("2f8a79-vmray"):
return (
CD
/ "data"
/ "dynamic"
/ "vmray"
/ "2f8a79b12a7a989ac7e5f6ec65050036588a92e65aeb6841e08dc228ff0e21b4_min_archive.zip"
)
elif name.startswith("ea2876"):
return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
elif name.startswith("1038a2"):
Expand Down
Loading

0 comments on commit 06271a8

Please sign in to comment.