From 06271a88d4ad17c6502e76fd168e316442b29634 Mon Sep 17 00:00:00 2001 From: Moritz Date: Thu, 26 Sep 2024 21:57:30 +0200 Subject: [PATCH] Fix VMRay missing process data (#2396) * get all processes, see #2394 * add tests for process recording * rename symbols for clarification * handle single and list entries * update changelog * dynamic: vmray: use monitor IDs to track processes and threads * dynamic: vmray: code refactor * dynamic: vmray: add sanity checks when processing monitor processes * dynamic: vmray: remove unnecessary keys() access * dynamic: vmray: clarify comments * Update CHANGELOG.md Co-authored-by: Willi Ballenthin * dynamic: vmray: update CHANGELOG --------- Co-authored-by: Mike Hunhoff Co-authored-by: Willi Ballenthin --- CHANGELOG.md | 1 + capa/features/extractors/vmray/__init__.py | 95 ++++++++++++++------- capa/features/extractors/vmray/extractor.py | 31 ++++--- capa/features/extractors/vmray/file.py | 23 +---- capa/features/extractors/vmray/models.py | 40 ++++++++- tests/fixtures.py | 8 ++ tests/test_vmray_features.py | 39 +++++---- 7 files changed, 158 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cd8b487c..2edd08c5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ - use Python 3.12 to build extra standalone build on Linux #2383 @williballenthin - bump minimum Python version to 3.8.1 to satisfy uv #2387 @williballenthin +- vmray: collect more process information from flog.xml #2394 @mr-tz @mike-hunhoff ### capa explorer IDA Pro plugin diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index 06d581cc9..4a004af61 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -10,6 +10,7 @@ from pathlib import Path from zipfile import ZipFile from collections import defaultdict +from dataclasses import dataclass from capa.exceptions import UnsupportedFormatError from capa.features.extractors.vmray.models 
import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict @@ -21,6 +22,21 @@ SUPPORTED_FLOG_VERSIONS = ("2",) +@dataclass +class VMRayMonitorThread: + tid: int # thread ID assigned by OS + monitor_id: int # unique ID assigned to thread by VMRay + process_monitor_id: int # unique ID assigned to containing process by VMRay + + +@dataclass +class VMRayMonitorProcess: + pid: int # process ID assigned by OS + ppid: int # parent process ID assigned by OS + monitor_id: int # unique ID assigned to process by VMRay + image_name: str + + class VMRayAnalysis: def __init__(self, zipfile_path: Path): self.zipfile = ZipFile(zipfile_path, "r") @@ -45,9 +61,15 @@ def __init__(self, zipfile_path: Path): self.exports: Dict[int, str] = {} self.imports: Dict[int, Tuple[str, str]] = {} self.sections: Dict[int, str] = {} - self.process_ids: Dict[int, int] = {} - self.process_threads: Dict[int, List[int]] = defaultdict(list) - self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list)) + self.monitor_processes: Dict[int, VMRayMonitorProcess] = {} + self.monitor_threads: Dict[int, VMRayMonitorThread] = {} + + # map monitor thread IDs to their associated monitor process ID + self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list) + + # map function calls to their associated monitor thread ID mapped to its associated monitor process ID + self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list)) + self.base_address: int self.sample_file_name: Optional[str] = None @@ -79,13 +101,14 @@ def __init__(self, zipfile_path: Path): self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD) + # do not change order, it matters self._compute_base_address() self._compute_imports() self._compute_exports() self._compute_sections() - self._compute_process_ids() - self._compute_process_threads() - self._compute_process_calls() + 
self._compute_monitor_processes() + self._compute_monitor_threads() + self._compute_monitor_process_calls() def _find_sample_file(self): for file_name, file_analysis in self.sv2.files.items(): @@ -128,34 +151,48 @@ def _compute_sections(self): for elffile_section in self.sample_file_static_data.elf.sections: self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name - def _compute_process_ids(self): + def _compute_monitor_processes(self): for process in self.sv2.processes.values(): - # we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused - assert process.monitor_id not in self.process_ids.keys() - self.process_ids[process.monitor_id] = process.os_pid + # we expect monitor IDs to be unique + assert process.monitor_id not in self.monitor_processes - def _compute_process_threads(self): - # logs/flog.xml appears to be the only file that contains thread-related data - # so we use it here to map processes to threads - for function_call in self.flog.analysis.function_calls: - pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID - tid: int = function_call.thread_id + ppid: int = ( + self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0 + ) + self.monitor_processes[process.monitor_id] = VMRayMonitorProcess( + process.os_pid, ppid, process.monitor_id, process.image_name + ) - assert isinstance(pid, int) - assert isinstance(tid, int) + # not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394 + for monitor_process in self.flog.analysis.monitor_processes: + vmray_monitor_process: VMRayMonitorProcess = VMRayMonitorProcess( + monitor_process.os_pid, + monitor_process.os_parent_pid, + monitor_process.process_id, + monitor_process.image_name, + ) - if tid not in self.process_threads[pid]: - self.process_threads[pid].append(tid) + if monitor_process.process_id not in self.monitor_processes: + 
self.monitor_processes[monitor_process.process_id] = vmray_monitor_process + else: + # we expect monitor processes recorded in both SummaryV2.json and flog.xml to equal + assert self.monitor_processes[monitor_process.process_id] == vmray_monitor_process - def _compute_process_calls(self): - for function_call in self.flog.analysis.function_calls: - pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID - tid: int = function_call.thread_id + def _compute_monitor_threads(self): + for monitor_thread in self.flog.analysis.monitor_threads: + # we expect monitor IDs to be unique + assert monitor_thread.thread_id not in self.monitor_threads - assert isinstance(pid, int) - assert isinstance(tid, int) + self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread( + monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id + ) + + # we expect each monitor thread ID to be unique for its associated monitor process ID e.g. monitor + # thread ID 10 should not be captured twice for monitor process ID 1 + assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.thread_id] - self.process_calls[pid][tid].append(function_call) + self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id) - def get_process_os_pid(self, monitor_id: int) -> int: - return self.process_ids[monitor_id] + def _compute_monitor_process_calls(self): + for function_call in self.flog.analysis.function_calls: + self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call) diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index 735c646b9..36a0b430f 100644 --- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -15,9 +15,16 @@ import capa.features.extractors.vmray.file import capa.features.extractors.vmray.global_ from 
capa.features.common import Feature, Characteristic -from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress -from capa.features.extractors.vmray import VMRayAnalysis -from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall +from capa.features.address import ( + NO_ADDRESS, + Address, + ThreadAddress, + ProcessAddress, + DynamicCallAddress, + AbsoluteVirtualAddress, +) +from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess +from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall from capa.features.extractors.base_extractor import ( CallHandle, SampleHashes, @@ -69,20 +76,24 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features def get_processes(self) -> Iterator[ProcessHandle]: - yield from capa.features.extractors.vmray.file.get_processes(self.analysis) + for monitor_process in self.analysis.monitor_processes.values(): + address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid) + yield ProcessHandle(address, inner=monitor_process) def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: # we have not identified process-specific features for VMRay yet yield from [] def get_process_name(self, ph) -> str: - process: Process = ph.inner - return process.image_name + monitor_process: VMRayMonitorProcess = ph.inner + return monitor_process.image_name def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: - for thread in self.analysis.process_threads[ph.address.pid]: - address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread) - yield ThreadHandle(address=address, inner={}) + for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]: + monitor_thread: VMRayMonitorThread = 
self.analysis.monitor_threads[monitor_thread_id] + + address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid) + yield ThreadHandle(address=address, inner=monitor_thread) def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: if False: @@ -92,7 +103,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat return def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]: - for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]: + for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]: addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id) yield CallHandle(address=addr, inner=function_call) diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py index 38ac9db01..7f4ba0395 100644 --- a/capa/features/extractors/vmray/file.py +++ b/capa/features/extractors/vmray/file.py @@ -6,37 +6,18 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging -from typing import Dict, Tuple, Iterator +from typing import Tuple, Iterator import capa.features.extractors.common from capa.features.file import Export, Import, Section from capa.features.common import String, Feature -from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.vmray import VMRayAnalysis from capa.features.extractors.helpers import generate_symbols -from capa.features.extractors.vmray.models import Process -from capa.features.extractors.base_extractor import ProcessHandle logger = logging.getLogger(__name__) -def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]: - processes: Dict[str, Process] = analysis.sv2.processes - - for process in processes.values(): - # we map VMRay's monitor ID to the OS PID to make it easier for users - # to follow the processes in capa's output - pid: int = analysis.get_process_os_pid(process.monitor_id) - ppid: int = ( - analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id) - if process.ref_parent_process - else 0 - ) - - addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid) - yield ProcessHandle(address=addr, inner=process) - - def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]: for addr, name in analysis.exports.items(): yield Export(name), AbsoluteVirtualAddress(addr) diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index a599dc420..f5371bec1 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -87,7 +87,7 @@ class Param(BaseModel): deref: Optional[ParamDeref] = None -def validate_param_list(value: Union[List[Param], Param]) -> List[Param]: +def validate_ensure_is_list(value: Union[List[Param], Param]) -> List[Param]: if isinstance(value, list): return value else: @@ -97,7 
+97,7 @@ def validate_param_list(value: Union[List[Param], Param]) -> List[Param]: # params may be stored as a list of Param or a single Param so we convert # the input value to Python list type before the inner validation (List[Param]) # is called -ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)] +ParamList = Annotated[List[Param], BeforeValidator(validate_ensure_is_list)] class Params(BaseModel): @@ -137,12 +137,46 @@ class FunctionReturn(BaseModel): from_addr: HexInt = Field(alias="from") +class MonitorProcess(BaseModel): + ts: HexInt + process_id: int + image_name: str + filename: str + # page_root: HexInt + os_pid: HexInt + # os_integrity_level: HexInt + # os_privileges: HexInt + monitor_reason: str + parent_id: int + os_parent_pid: HexInt + # cmd_line: str + # cur_dir: str + # os_username: str + # bitness: int + # os_groups: str + + +class MonitorThread(BaseModel): + ts: HexInt + thread_id: int + process_id: int + os_tid: HexInt + + +# handle the case where there is only a single entry but the model expects a list +MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)] +MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)] +FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)] + + class Analysis(BaseModel): log_version: str # tested 2 analyzer_version: str # tested 2024.2.1 # analysis_date: str - function_calls: List[FunctionCall] = Field(alias="fncall", default=[]) + monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[]) + monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[]) + function_calls: FunctionCallList = Field(alias="fncall", default=[]) # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[]) diff --git a/tests/fixtures.py b/tests/fixtures.py index e4d0a6fa0..1912a456a 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -431,6 +431,14 @@ def 
get_data_path_by_name(name) -> Path: / "vmray" / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip" ) + elif name.startswith("2f8a79-vmray"): + return ( + CD + / "data" + / "dynamic" + / "vmray" + / "2f8a79b12a7a989ac7e5f6ec65050036588a92e65aeb6841e08dc228ff0e21b4_min_archive.zip" + ) elif name.startswith("ea2876"): return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_" elif name.startswith("1038a2"): diff --git a/tests/test_vmray_features.py b/tests/test_vmray_features.py index d92a75e49..02eb683ec 100644 --- a/tests/test_vmray_features.py +++ b/tests/test_vmray_features.py @@ -20,21 +20,21 @@ # file/imports ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True), # thread/api calls - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True), - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("DoesNotExist"), False), # call/api - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), True), # call/string argument ( "93b2d1-vmray", - "process=(2176:0),thread=7,call=10323", + "process=(2176:0),thread=2420,call=10323", capa.features.common.String("raw.githubusercontent.com"), True, ), # call/number argument # VirtualAlloc(4096, 4) - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4096), True), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", 
capa.features.insn.Number(4), True), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -46,24 +46,24 @@ # file/imports ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1), # thread/api calls - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("free"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), 5), # call/api - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("free"), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("GetAddrInfoW"), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), 1), # call/string argument ( "93b2d1-vmray", - "process=(2176:0),thread=7,call=10323", + "process=(2176:0),thread=2420,call=10323", capa.features.common.String("raw.githubusercontent.com"), 1, ), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10323", capa.features.common.String("non_existant"), 0), # call/number argument - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1), - ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", 
capa.features.insn.Number(4096), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4), 1), + ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(404), 0), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -87,3 +87,10 @@ def test_vmray_features(sample, scope, feature, expected): ) def test_vmray_feature_counts(sample, scope, feature, expected): fixtures.do_test_feature_count(fixtures.get_vmray_extractor, sample, scope, feature, expected) + + +def test_vmray_processes(): + # see #2394 + path = fixtures.get_data_path_by_name("2f8a79-vmray") + vmre = fixtures.get_vmray_extractor(path) + assert len(vmre.analysis.monitor_processes) == 9