From 06271a88d4ad17c6502e76fd168e316442b29634 Mon Sep 17 00:00:00 2001
From: Moritz <mr-tz@users.noreply.github.com>
Date: Thu, 26 Sep 2024 21:57:30 +0200
Subject: [PATCH] Fix VMRay missing process data (#2396)

* get all processes, see #2394

* add tests for process recording

* rename symbols for clarification

* handle single and list entries

* update changelog

* dynamic: vmray: use monitor IDs to track processes and threads

* dynamic: vmray: code refactor

* dynamic: vmray: add sanity checks when processing monitor processes

* dynamic: vmray: remove unnecessary keys() access

* dynamic: vmray: clarify comments

* Update CHANGELOG.md

Co-authored-by: Willi Ballenthin <wballenthin@google.com>

* dynamic: vmray: update CHANGELOG

---------

Co-authored-by: Mike Hunhoff <mike.hunhoff@gmail.com>
Co-authored-by: Willi Ballenthin <wballenthin@google.com>
---
 CHANGELOG.md                                |  1 +
 capa/features/extractors/vmray/__init__.py  | 95 ++++++++++++++-------
 capa/features/extractors/vmray/extractor.py | 31 ++++---
 capa/features/extractors/vmray/file.py      | 23 +----
 capa/features/extractors/vmray/models.py    | 40 ++++++++-
 tests/fixtures.py                           |  8 ++
 tests/test_vmray_features.py                | 39 +++++----
 7 files changed, 158 insertions(+), 79 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6cd8b487c..2edd08c5c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@
 
 - use Python 3.12 to build extra standalone build on Linux #2383 @williballenthin
 - bump minimum Python version to 3.8.1 to satisfy uv #2387 @williballenthin
+- vmray: collect more process information from flog.xml #2394 @mr-tz @mike-hunhoff
 
 ### capa explorer IDA Pro plugin
 
diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 06d581cc9..4a004af61 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 from zipfile import ZipFile
 from collections import defaultdict
+from dataclasses import dataclass
 
 from capa.exceptions import UnsupportedFormatError
 from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict
@@ -21,6 +22,21 @@
 SUPPORTED_FLOG_VERSIONS = ("2",)
 
 
+@dataclass
+class VMRayMonitorThread:
+    tid: int  # thread ID assigned by OS
+    monitor_id: int  # unique ID assigned to thread by VMRay
+    process_monitor_id: int  # unique ID assigned to containing process by VMRay
+
+
+@dataclass
+class VMRayMonitorProcess:
+    pid: int  # process ID assigned by OS
+    ppid: int  # parent process ID assigned by OS
+    monitor_id: int  # unique ID assigned to process by VMRay
+    image_name: str
+
+
 class VMRayAnalysis:
     def __init__(self, zipfile_path: Path):
         self.zipfile = ZipFile(zipfile_path, "r")
@@ -45,9 +61,15 @@ def __init__(self, zipfile_path: Path):
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, Tuple[str, str]] = {}
         self.sections: Dict[int, str] = {}
-        self.process_ids: Dict[int, int] = {}
-        self.process_threads: Dict[int, List[int]] = defaultdict(list)
-        self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
+        self.monitor_processes: Dict[int, VMRayMonitorProcess] = {}
+        self.monitor_threads: Dict[int, VMRayMonitorThread] = {}
+
+        # map monitor thread IDs to their associated monitor process ID
+        self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list)
+
+        # map monitor process ID -> monitor thread ID -> function calls recorded for that thread
+        self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
+
         self.base_address: int
 
         self.sample_file_name: Optional[str] = None
@@ -79,13 +101,14 @@ def __init__(self, zipfile_path: Path):
 
         self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
 
+        # do not change order, it matters
         self._compute_base_address()
         self._compute_imports()
         self._compute_exports()
         self._compute_sections()
-        self._compute_process_ids()
-        self._compute_process_threads()
-        self._compute_process_calls()
+        self._compute_monitor_processes()
+        self._compute_monitor_threads()
+        self._compute_monitor_process_calls()
 
     def _find_sample_file(self):
         for file_name, file_analysis in self.sv2.files.items():
@@ -128,34 +151,48 @@ def _compute_sections(self):
             for elffile_section in self.sample_file_static_data.elf.sections:
                 self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name
 
-    def _compute_process_ids(self):
+    def _compute_monitor_processes(self):
         for process in self.sv2.processes.values():
-            # we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused
-            assert process.monitor_id not in self.process_ids.keys()
-            self.process_ids[process.monitor_id] = process.os_pid
+            # we expect monitor IDs to be unique
+            assert process.monitor_id not in self.monitor_processes
 
-    def _compute_process_threads(self):
-        # logs/flog.xml appears to be the only file that contains thread-related data
-        # so we use it here to map processes to threads
-        for function_call in self.flog.analysis.function_calls:
-            pid: int = self.get_process_os_pid(function_call.process_id)  # flog.xml uses process monitor ID, not OS PID
-            tid: int = function_call.thread_id
+            ppid: int = (
+                self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
+            )
+            self.monitor_processes[process.monitor_id] = VMRayMonitorProcess(
+                process.os_pid, ppid, process.monitor_id, process.image_name
+            )
 
-            assert isinstance(pid, int)
-            assert isinstance(tid, int)
+        # not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394
+        for monitor_process in self.flog.analysis.monitor_processes:
+            vmray_monitor_process: VMRayMonitorProcess = VMRayMonitorProcess(
+                monitor_process.os_pid,
+                monitor_process.os_parent_pid,
+                monitor_process.process_id,
+                monitor_process.image_name,
+            )
 
-            if tid not in self.process_threads[pid]:
-                self.process_threads[pid].append(tid)
+            if monitor_process.process_id not in self.monitor_processes:
+                self.monitor_processes[monitor_process.process_id] = vmray_monitor_process
+            else:
+                # we expect monitor processes recorded in both SummaryV2.json and flog.xml to be equal
+                assert self.monitor_processes[monitor_process.process_id] == vmray_monitor_process
 
-    def _compute_process_calls(self):
-        for function_call in self.flog.analysis.function_calls:
-            pid: int = self.get_process_os_pid(function_call.process_id)  # flog.xml uses process monitor ID, not OS PID
-            tid: int = function_call.thread_id
+    def _compute_monitor_threads(self):
+        for monitor_thread in self.flog.analysis.monitor_threads:
+            # we expect monitor IDs to be unique
+            assert monitor_thread.thread_id not in self.monitor_threads
 
-            assert isinstance(pid, int)
-            assert isinstance(tid, int)
+            self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread(
+                monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id
+            )
+
+            # we expect each monitor thread ID to be unique for its associated monitor process ID e.g. monitor
+            # thread ID 10 should not be captured twice for monitor process ID 1
+            assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.process_id]
 
-            self.process_calls[pid][tid].append(function_call)
+            self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id)
 
-    def get_process_os_pid(self, monitor_id: int) -> int:
-        return self.process_ids[monitor_id]
+    def _compute_monitor_process_calls(self):
+        for function_call in self.flog.analysis.function_calls:
+            self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 735c646b9..36a0b430f 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -15,9 +15,16 @@
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
 from capa.features.common import Feature, Characteristic
-from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
-from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall
+from capa.features.address import (
+    NO_ADDRESS,
+    Address,
+    ThreadAddress,
+    ProcessAddress,
+    DynamicCallAddress,
+    AbsoluteVirtualAddress,
+)
+from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess
+from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -69,20 +76,24 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
         yield from self.global_features
 
     def get_processes(self) -> Iterator[ProcessHandle]:
-        yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
+        for monitor_process in self.analysis.monitor_processes.values():
+            address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
+            yield ProcessHandle(address, inner=monitor_process)
 
     def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
         # we have not identified process-specific features for VMRay yet
         yield from []
 
     def get_process_name(self, ph) -> str:
-        process: Process = ph.inner
-        return process.image_name
+        monitor_process: VMRayMonitorProcess = ph.inner
+        return monitor_process.image_name
 
     def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
-        for thread in self.analysis.process_threads[ph.address.pid]:
-            address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
-            yield ThreadHandle(address=address, inner={})
+        for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
+            monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]
+
+            address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
+            yield ThreadHandle(address=address, inner=monitor_thread)
 
     def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
         if False:
@@ -92,7 +103,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat
         return
 
     def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
-        for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]:
+        for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]:
             addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id)
             yield CallHandle(address=addr, inner=function_call)
 
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 38ac9db01..7f4ba0395 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -6,37 +6,18 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import logging
-from typing import Dict, Tuple, Iterator
+from typing import Tuple, Iterator
 
 import capa.features.extractors.common
 from capa.features.file import Export, Import, Section
 from capa.features.common import String, Feature
-from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
+from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.helpers import generate_symbols
-from capa.features.extractors.vmray.models import Process
-from capa.features.extractors.base_extractor import ProcessHandle
 
 logger = logging.getLogger(__name__)
 
 
-def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
-    processes: Dict[str, Process] = analysis.sv2.processes
-
-    for process in processes.values():
-        # we map VMRay's monitor ID to the OS PID to make it easier for users
-        # to follow the processes in capa's output
-        pid: int = analysis.get_process_os_pid(process.monitor_id)
-        ppid: int = (
-            analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)
-            if process.ref_parent_process
-            else 0
-        )
-
-        addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid)
-        yield ProcessHandle(address=addr, inner=process)
-
-
 def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for addr, name in analysis.exports.items():
         yield Export(name), AbsoluteVirtualAddress(addr)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index a599dc420..f5371bec1 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -87,7 +87,7 @@ class Param(BaseModel):
     deref: Optional[ParamDeref] = None
 
 
-def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
+def validate_ensure_is_list(value: Union[List[Param], Param]) -> List[Param]:
     if isinstance(value, list):
         return value
     else:
@@ -97,7 +97,7 @@ def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
 # params may be stored as a list of Param or a single Param so we convert
 # the input value to Python list type before the inner validation (List[Param])
 # is called
-ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
+ParamList = Annotated[List[Param], BeforeValidator(validate_ensure_is_list)]
 
 
 class Params(BaseModel):
@@ -137,12 +137,46 @@ class FunctionReturn(BaseModel):
     from_addr: HexInt = Field(alias="from")
 
 
+class MonitorProcess(BaseModel):
+    ts: HexInt
+    process_id: int
+    image_name: str
+    filename: str
+    # page_root: HexInt
+    os_pid: HexInt
+    # os_integrity_level: HexInt
+    # os_privileges: HexInt
+    monitor_reason: str
+    parent_id: int
+    os_parent_pid: HexInt
+    # cmd_line: str
+    # cur_dir: str
+    # os_username: str
+    # bitness: int
+    # os_groups: str
+
+
+class MonitorThread(BaseModel):
+    ts: HexInt
+    thread_id: int
+    process_id: int
+    os_tid: HexInt
+
+
+# handle the case where there is only a single entry, but the model expects a list
+MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)]
+MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)]
+FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)]
+
+
 class Analysis(BaseModel):
     log_version: str  # tested 2
     analyzer_version: str  # tested 2024.2.1
     # analysis_date: str
 
-    function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
+    monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[])
+    monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[])
+    function_calls: FunctionCallList = Field(alias="fncall", default=[])
     # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])
 
 
diff --git a/tests/fixtures.py b/tests/fixtures.py
index e4d0a6fa0..1912a456a 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -431,6 +431,14 @@ def get_data_path_by_name(name) -> Path:
             / "vmray"
             / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip"
         )
+    elif name.startswith("2f8a79-vmray"):
+        return (
+            CD
+            / "data"
+            / "dynamic"
+            / "vmray"
+            / "2f8a79b12a7a989ac7e5f6ec65050036588a92e65aeb6841e08dc228ff0e21b4_min_archive.zip"
+        )
     elif name.startswith("ea2876"):
         return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
     elif name.startswith("1038a2"):
diff --git a/tests/test_vmray_features.py b/tests/test_vmray_features.py
index d92a75e49..02eb683ec 100644
--- a/tests/test_vmray_features.py
+++ b/tests/test_vmray_features.py
@@ -20,21 +20,21 @@
         # file/imports
         ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True),
         # thread/api calls
-        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True),
-        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("DoesNotExist"), False),
         # call/api
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
         # call/string argument
         (
             "93b2d1-vmray",
-            "process=(2176:0),thread=7,call=10323",
+            "process=(2176:0),thread=2420,call=10323",
             capa.features.common.String("raw.githubusercontent.com"),
             True,
         ),
         # call/number argument
         # VirtualAlloc(4096, 4)
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4096), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4), True),
     ],
     # order tests by (file, item)
     # so that our LRU cache is most effective.
@@ -46,24 +46,24 @@
         # file/imports
         ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1),
         # thread/api calls
-        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1),
-        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("free"), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), 5),
         # call/api
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("free"), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
         # call/string argument
         (
             "93b2d1-vmray",
-            "process=(2176:0),thread=7,call=10323",
+            "process=(2176:0),thread=2420,call=10323",
             capa.features.common.String("raw.githubusercontent.com"),
             1,
         ),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10323", capa.features.common.String("non_existant"), 0),
         # call/number argument
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1),
-        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4096), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(404), 0),
     ],
     # order tests by (file, item)
     # so that our LRU cache is most effective.
@@ -87,3 +87,10 @@ def test_vmray_features(sample, scope, feature, expected):
 )
 def test_vmray_feature_counts(sample, scope, feature, expected):
     fixtures.do_test_feature_count(fixtures.get_vmray_extractor, sample, scope, feature, expected)
+
+
+def test_vmray_processes():
+    # see #2394
+    path = fixtures.get_data_path_by_name("2f8a79-vmray")
+    vmre = fixtures.get_vmray_extractor(path)
+    assert len(vmre.analysis.monitor_processes) == 9