Skip to content

Commit

Permalink
result_summary: cache fetched nodes to reduce the number of queries
Browse files Browse the repository at this point in the history
The bulk of the time is spent on API requests to search for nodes. Use a
node cache to reduce the number of requests.

Signed-off-by: Ricardo Cañuelo <[email protected]>
  • Loading branch information
r-c-n authored and nuclearcat committed Oct 8, 2024
1 parent 121c428 commit 2854d2c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 5 deletions.
1 change: 1 addition & 0 deletions src/result_summary/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def run(service, context):
result_summary.logger.debug(f"Query matches found: {len(query_results)}")
nodes.extend(query_results)
result_summary.logger.info(f"Total nodes found: {len(nodes)}")
utils.node_cache_write(nodes)

# Post-process nodes
# Filter log files
Expand Down
33 changes: 28 additions & 5 deletions src/result_summary/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,32 @@
import requests
import yaml
from typing import Any, Dict
from threading import Lock

import result_summary


CONFIG_TRACES_FILE_PATH = './config/traces_config.yaml'
LAVA_JOB_URL = 'https://lava.collabora.dev/scheduler/job/'

# Module-level cache of nodes already fetched from the API, keyed by node
# id. Access is guarded by ``node_cache_lock`` so concurrent readers and
# writers do not race on the shared dict.
node_cache = {}
node_cache_lock = Lock()

def node_cache_write(nodes):
    """Add every node in *nodes* to the module-level node cache.

    Nodes whose id is already cached are left untouched; only ids not
    yet present are stored. Thread-safe: mutation happens while holding
    ``node_cache_lock``.
    """
    global node_cache
    with node_cache_lock:
        for item in nodes:
            # setdefault only inserts when the id is not cached yet,
            # preserving the first-seen entry.
            node_cache.setdefault(item['id'], item)

def node_cache_read(id):
    """Return the cached node with the given *id*, or None on a cache miss.

    Thread-safe: the lookup happens while holding ``node_cache_lock``.
    """
    # Bug fix: the previous body began with an unconditional ``return None``,
    # which made the locked lookup below unreachable — every read was a miss,
    # defeating the cache and keeping the API request count unchanged.
    global node_cache
    with node_cache_lock:
        # dict.get returns None when the id is absent, which is exactly
        # the miss signal callers expect.
        return node_cache.get(id)


def split_query_params(query_string):
"""Given a string input formatted like this:
Expand Down Expand Up @@ -213,16 +232,20 @@ def post_process_node(node, api):
key 'logs', which contains a dictionary of processed log
data (see get_logs()).
"""
node_cache = {}

def get_parent(node, api):
    """Fetch and return a node's parent, or None if it has none.

    Uses and updates the module node cache in the process: a cached
    parent is returned without touching the API; a freshly fetched
    parent is written back to the cache so later lookups of the same
    id avoid another request.

    NOTE(review): this span interleaved pre- and post-change diff lines
    (a stale direct ``node_cache`` lookup, a dangling ``else:`` with a
    duplicate API fetch, and a ``nonlocal`` for a removed local cache);
    this body is the reconstructed cache-aware version.
    """
    parent_id = node.get('parent')
    if not parent_id:
        return None
    # Cache hit: no API round-trip needed.
    parent = node_cache_read(parent_id)
    if parent:
        return parent
    # Cache miss: fetch from the API and remember the result for
    # subsequent calls.
    parent = api.node.get(parent_id)
    if parent:
        node_cache_write([parent])
    return parent

def get_job_id(node, api):
Expand Down

0 comments on commit 2854d2c

Please sign in to comment.