feat: run dbt in batches #158

Open · wants to merge 12 commits into main
128 changes: 122 additions & 6 deletions dbt/dbt-run.py
@@ -42,6 +42,18 @@ def setup():
""")
conn.commit()

with connection() as conn:
with conn.cursor() as cur:
cur.execute(f"""
CREATE TABLE IF NOT EXISTS
{os.getenv('POSTGRES_SCHEMA')}.dbt_batch_status (
id SERIAL PRIMARY KEY,
timestamp TIMESTAMP,
status TEXT
)
""")
conn.commit()

def get_package():
package_json = '{}'

@@ -107,8 +119,7 @@ def save_package_manifest(package_json, manifest_json):
)
conn.commit()


def update_models():
def update_dbt_deps():
# install the cht pipeline package
package_json = get_package()
subprocess.run(["dbt", "deps", "--profiles-dir", ".dbt", "--upgrade"])
@@ -119,6 +130,8 @@ def update_models():
# save the new manifest and package for the next run
save_package_manifest(package_json, manifest_json)

def update_models():
update_dbt_deps()
# anything that changed, run a full refresh
subprocess.run(["dbt", "run",
"--profiles-dir",
@@ -133,10 +146,113 @@ def run_incremental_models():
# update incremental models (and tables if there are any)
subprocess.run(["dbt", "run", "--profiles-dir", ".dbt", "--exclude", "config.materialized:view"])

def get_pending_doc_count():
with connection() as conn:
with conn.cursor() as cur:
cur.execute(f"""
SELECT SUM(pending)
FROM {os.getenv('POSTGRES_SCHEMA')}.couchdb_progress
""")
return cur.fetchone()[0]

def get_batch_ranges():
with connection() as conn:
with conn.cursor() as cur:
cur.execute(f"""
SELECT
MIN(saved_timestamp) as start_timestamp,
MAX(saved_timestamp) as end_timestamp
FROM {os.getenv('POSTGRES_SCHEMA')}.{os.getenv('POSTGRES_TABLE')}
""")
result = cur.fetchone()
if result is None or len(result) == 0:
return []

start_timestamp, end_timestamp = result
start_timestamp = int(start_timestamp.timestamp())
end_timestamp = int(end_timestamp.timestamp())
batch_size = int(os.getenv("DBT_BATCH_SIZE") or 10000)
return [(start, min(start + batch_size, end_timestamp)) for start in range(start_timestamp, end_timestamp, batch_size)]

def update_batch_status(timestamp, status):
with connection() as conn:
with conn.cursor() as cur:
# insert new entry
cur.execute(
f"INSERT INTO {os.getenv('POSTGRES_SCHEMA')}.dbt_batch_status (timestamp, status) VALUES (%s, %s);", [timestamp, status]
)
conn.commit()

def get_last_processed_timestamp():
with connection() as conn:
with conn.cursor() as cur:
cur.execute(f"""
SELECT MAX(timestamp)
FROM {os.getenv('POSTGRES_SCHEMA')}.dbt_batch_status
WHERE status = 'success'
""")
result = cur.fetchone()
if result and result[0]:
return result[0]
return '1970-01-01 00:00:00'

def get_max_timestamp():
with connection() as conn:
with conn.cursor() as cur:
cur.execute(f"""
SELECT MAX(saved_timestamp)
FROM {os.getenv('POSTGRES_SCHEMA')}.document_metadata
Member:

It's a little strange to reference document_metadata here, since this is part of the pipeline schema.
Is there some way we can make this independent of the pipeline schema?
What if someone wants to use cht-sync with a completely different set of models?

Contributor Author:

The idea is that anyone building their own models would still have to use our base models and build any additional models on top of them. There would be an issue if we updated the base models and renamed this table, since this query would have to be updated as well.

Member:

Exactly. I've been reading, and there's nothing dbt can return by default to batch on. So disappointing.
Can we add something that tests whether this table exists before we start batching, and logs a friendly message explaining why batching won't work? Or even throws an error saying that running in batches is not possible.

Contributor Author (@njuguna-n, Oct 8, 2024):

To clarify: you are suggesting we throw an error if the batching flag is enabled but the table does not exist?

Member:

yes.
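
A minimal sketch of the agreed guard, purely illustrative (assert_batch_table_exists is a hypothetical name, not part of this diff; it reuses this file's connection() helper and os import):

def assert_batch_table_exists():
    # Hypothetical guard: fail fast when batching is enabled but the
    # base model's table has not been created yet.
    with connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT EXISTS (
                    SELECT 1
                    FROM information_schema.tables
                    WHERE table_schema = %s
                      AND table_name = 'document_metadata'
                )
            """, [os.getenv('POSTGRES_SCHEMA')])
            if not cur.fetchone()[0]:
                raise RuntimeError(
                    "Batching is enabled but document_metadata does not exist; "
                    "run the base models first or disable batching."
                )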

""")
return cur.fetchone()[0]

def run_dbt_in_batches():
print("Running dbt in batches")
last_processed_timestamp = get_last_processed_timestamp()
batch_size = int(os.getenv("DBT_BATCH_SIZE") or 10000)
print(f"Starting new batch with timestamp: {last_processed_timestamp}")

while True:
print(f"Starting new batch with timestamp: {last_processed_timestamp}")
update_dbt_deps()
result = subprocess.run([
"dbt", "run",
"--profiles-dir", ".dbt",
"--vars", f'{{start_timestamp: "{last_processed_timestamp}", batch_size: {batch_size}}}'
])

if result.returncode != 0:
print("Error running dbt")
update_batch_status(last_processed_timestamp, "error")
time.sleep(int(os.getenv("DATAEMON_INTERVAL") or 5))
continue

update_batch_status(last_processed_timestamp, "success")
max_timestamp = get_max_timestamp()

if max_timestamp == last_processed_timestamp:
print("Finished processing all batches")
break

last_processed_timestamp = max_timestamp

run_dbt()

def run_dbt():
print("Starting regular dbt run")
while True:
update_models()
run_incremental_models()
time.sleep(int(os.getenv("DATAEMON_INTERVAL") or 5))

if __name__ == "__main__":
print("Starting dbt run")
setup()
while True:
update_models()
run_incremental_models()
time.sleep(int(os.getenv("DATAEMON_INTERVAL") or 5))
# check if we need to run in batch
pending_doc_count = get_pending_doc_count()
print(f"Pending doc count: {pending_doc_count}")
process_in_batch = pending_doc_count > int(os.getenv("DBT_BATCH_PROCESS_LIMIT") or 100000)
if process_in_batch:
print("Processing in batches")
run_dbt_in_batches()
else:
run_dbt()
Member:

This is great! I have another question though:

What happens if a cht-sync instance is started with RUN_DBT_IN_BATCHES=false and then, later, when there's a large influx of docs, cht-sync is restarted with RUN_DBT_IN_BATCHES=true? Will this make dbt process everything again, since we don't have a batch status stored?

Contributor Author:

I have removed the document_metadata table check because there was a bug where dbt would not run for new deployments where the tables and views are not yet created. The dbt ls command should be able to help identify whether the model is defined, but I didn't manage to get it to work, so we'll catch that error elsewhere in the code.

> What happens if a cht-sync instance is started with RUN_DBT_IN_BATCHES=false and then, later, when there's a large influx of docs, cht-sync is restarted with RUN_DBT_IN_BATCHES=true? Will this make dbt process everything again, since we don't have a batch status stored?

I have added a check on the document_metadata table for the latest timestamp that handles this scenario.
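
One illustrative way the described check could work (a sketch under assumptions, not this PR's exact code; get_batch_start_timestamp is a hypothetical name building on this file's helpers):

def get_batch_start_timestamp():
    # Sketch only: if dbt_batch_status records no successful batch (e.g. the
    # instance previously ran in full mode), fall back to the newest
    # saved_timestamp already present in document_metadata, so enabling
    # batching later does not reprocess everything from 1970.
    last_processed = get_last_processed_timestamp()
    if last_processed != '1970-01-01 00:00:00':
        return last_processed
    with connection() as conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                SELECT MAX(saved_timestamp)
                FROM {os.getenv('POSTGRES_SCHEMA')}.document_metadata
            """)
            row = cur.fetchone()
            return row[0] if row and row[0] else last_processed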

Member:

I'm still unclear on what would happen if we run in batches, then change the config, relaunch dbt, and run in full. Will dbt know which docs it has already indexed?
And if we switch from running in full to running in batches, will everything start from zero?

3 changes: 3 additions & 0 deletions dbt/dbt_project.yml
@@ -9,3 +9,6 @@ target-path: "target"
clean-targets: ["target", "dbt_modules"]
macro-paths: ["macros"]
log-path: "logs"
vars:
start_timestamp: null
batch_size: null
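
The Python runner above overrides these vars on every batch via --vars. One detail worth noting: dbt's --vars flag takes a YAML (or JSON, which YAML subsumes) dictionary string, so serializing the payload with json.dumps sidesteps the quoting pitfalls of a hand-built f-string. A sketch, assuming the loop variables from run_dbt_in_batches:

import json
import subprocess

dbt_vars = json.dumps({
    "start_timestamp": str(last_processed_timestamp),  # from the batch loop above
    "batch_size": batch_size,
})
subprocess.run(["dbt", "run", "--profiles-dir", ".dbt", "--vars", dbt_vars])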