Merge pull request #199 from ORNL/ml_epochs

Ml epochs
ORNL · Jan 13, 2025 · e6b164a · e6b164a
2 parents a1f51cf + 8a5e98b
commit e6b164a
Show file tree

Hide file tree

Showing 34 changed files with 608 additions and 219 deletions.
diff --git a/.github/workflows/run-checks.yml → .github/workflows/checks.yml b/.github/workflows/run-checks.yml → .github/workflows/checks.yml
@@ -1,30 +1,26 @@
 name: Linter, formatter, and docs checks
-on: [pull_request]
 
-permissions:
-  contents: read
+on: pull_request
 
 jobs:
-  build:
-    runs-on: ubuntu-latest
-    if: "!contains(github.event.head_commit.message, 'CI Bot')"
 
+  checks:
+    runs-on: ubuntu-22.04
+    if: "!contains(github.event.head_commit.message, 'CI Bot')"
     steps:
       - uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
 
-      - name: Set up Python 3.10
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.12"
           cache: "pip"
 
       - name: Install package and dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install ruff
-          python -m pip install .[docs]
+          pip install --upgrade pip
+          pip install ruff
+          pip install .[docs]
 
       - name: Run linter and formatter checks using ruff
         run: make checks

diff --git a/.github/workflows/run-llm-tests.yml b/.github/workflows/run-llm-tests.yml
@@ -0,0 +1,52 @@
+# Runs the LLM example test runner on every pull request, once per Python
+# version listed in the matrix below.
+name: LLM Tests
+on: [pull_request]
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [ "3.10", "3.11", "3.12" ]
+    env:
+      MONGO_ENABLED: true
+      LMDB_ENABLED: false
+    timeout-minutes: 60
+    # NOTE(review): head_commit belongs to the push event payload; on
+    # pull_request it is presumably empty, so this guard may never skip a run.
+    # Confirm before relying on it.
+    if: "!contains(github.event.head_commit.message, 'CI Bot')"
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+
+      # Written as if/elif/else on purpose: in a flat `A && X || B && Y || Z`
+      # chain, && and || share one precedence level and associate left, so a
+      # successful Linux branch would fall through into the macOS branch too.
+      - name: Show OS Info
+        run: |
+          if [[ "$OSTYPE" == "linux-gnu"* ]]; then
+            echo "OS Type: Linux"
+            (command -v lsb_release &> /dev/null && lsb_release -a) || cat /etc/os-release
+            uname -r
+          elif [[ "$OSTYPE" == "darwin"* ]]; then
+            echo "OS Type: macOS"
+            sw_vers
+            uname -r
+          else
+            echo "Unsupported OS type: $OSTYPE"
+          fi
+
+      # Step name mirrors the make target it invokes (and the matching
+      # services-stop-mongo target used further down).
+      - name: Start docker compose with mongo services
+        run: make services-mongo
+
+      - name: Upgrade pip
+        run: |
+          python -m pip install --upgrade pip
+          python --version
+
+      # The third argument names the test file to run in place of the script's
+      # default test list.
+      - name: Test LLM
+        run: bash .github/workflows/run_examples.sh examples true llm_complex/llm_test_runner.py
+
+      - name: Shut down docker compose
+        run: make services-stop-mongo
+
+      # Besides `make clean`, this prints and truncates every *.log file under
+      # the runner directory and prunes all unused docker images.
+      - name: Clean up
+        run: |
+          make clean
+          find /home/runner/runners/ -type f -name "*.log" -exec sh -c 'echo {}; >"{}"' \; || true
+          docker image prune -a -f
+
+      # Lists every file left in the workspace, smallest first.
+      - name: List large files
+        run: find . -type f -exec du -h {} + | sort -h
diff --git a/.github/workflows/run_examples.sh b/.github/workflows/run_examples.sh
@@ -25,7 +25,7 @@ fi
 
 # Function to run tests with common steps
 run_test() {
-  test_path="${EXAMPLES_DIR}/${1}_example.py"
+  test_path="${EXAMPLES_DIR}/${1}"
   test_type="$1"
   with_mongo="$2"
   echo "Test type=${test_type}"
@@ -39,6 +39,8 @@ run_test() {
     pip install .[mongo] > /dev/null 2>&1
   fi
 
+
+  # The following block is only needed to install special dependencies.
   if [[ "$test_type" =~ "mlflow" ]]; then
     echo "Installing mlflow"
     pip install .[mlflow] > /dev/null 2>&1
@@ -53,6 +55,7 @@ run_test() {
     pip install .[ml_dev] > /dev/null 2>&1
   elif [[ "$test_type" =~ "llm_complex" ]]; then
     echo "Installing ml_dev dependencies"
+    pip install .[dask] > /dev/null 2>&1
     pip install .[ml_dev]
     echo "Defining python path for llm_complex..."
     export PYTHONPATH=$PYTHONPATH:${EXAMPLES_DIR}/llm_complex
@@ -62,15 +65,13 @@ run_test() {
   echo "Running $test_path ..."
   python "$test_path" | tee output.log
   echo "Ok, ran $test_path."
-  # Check for errors in the output
   if grep -iq "error" output.log; then
     echo "Test $test_path failed! See output.log for details."
     exit 1
   fi
 
   echo "Great, no errors to run $test_path."
 
-  # Clean up the log file
   rm output.log
 }
 
@@ -81,7 +82,7 @@ echo "Using examples directory: $EXAMPLES_DIR"
 echo "With Mongo? ${WITH_MONGO}"
 
 # Define the test cases
-default_tests=("instrumented_simple" "instrumented_loop" "dask" "mlflow" "tensorboard" "single_layer_perceptron" "llm_complex/llm_main")
+default_tests=("instrumented_simple_example.py" "instrumented_loop_example.py" "distributed_consumer_example.py" "dask_example.py" "mlflow_example.py" "tensorboard_example.py" "single_layer_perceptron_example.py" "llm_complex/llm_main_example.py")
 
 # Use the third argument if provided, otherwise use default tests
 if [[ -n "$3" ]]; then

diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,7 @@
 **/*build*
 **/*egg*
 **/*pycache*
-**/*dist*
+#**/*dist*
 **/*mlflow.db*
 **/*mnist*
 **/*tensorboard_events*
@@ -23,3 +23,4 @@ deployment/data
 **/*output_data*
 examples/llm_complex/input_data
 tmp_tests/
+nohup.out
diff --git a/Makefile b/Makefile
@@ -41,11 +41,11 @@ clean:
 	find . -type f -name "*.pth" -exec rm -f {} \; || true
 	find . -type f -name "mlflow.db" -exec rm -f {} \; || true
 	find . -type d -name "mlruns" -exec rm -rf {} \; 2>/dev/null || true
-	find . -type d -name "mlruns" -exec rm -rf {} \; 2>/dev/null || true
 	find . -type d -name "__pycache__" -exec rm -rf {} \;  2>/dev/null || true
 	find . -type d -name "*tfevents*" -exec rm -rf {} \;  2>/dev/null || true
 	find . -type d -name "*output_data*" -exec rm -rf {} \;  2>/dev/null || true
-	# sphinx-build -M clean docs docs/_build This needs to be fixed.
+	find . -type f -name "*nohup*" -exec rm -rf {} \;  2>/dev/null || true
+	sphinx-build -M clean docs docs/_build > /dev/null 2>&1 || true
 
 # Build the HTML documentation using Sphinx
 .PHONY: docs
@@ -96,7 +96,3 @@ tests:
 .PHONY: tests-notebooks
 tests-notebooks:
 	pytest --nbmake "notebooks/" --nbmake-timeout=600 --ignore=notebooks/dask_from_CLI.ipynb
-
-.PHONY: tests-all
-tests-all:
-	pytest
diff --git a/examples/distributed_consumer_example.py b/examples/distributed_consumer_example.py
@@ -0,0 +1,76 @@
+import os
+import subprocess
+import uuid
+from time import sleep
+from flowcept import Flowcept, FlowceptTask
+
+def execute_cmd(command: str) -> int:
+    """
+    Launch a command in the background through ``nohup`` and report its PID.
+
+    The command is handed to ``/bin/bash`` with stdout and stderr redirected to
+    ``/dev/null``; the shell then echoes ``$!`` so the PID of the background
+    job can be read back from the captured output.
+
+    Parameters
+    ----------
+    command : str
+        The command line to run in the background.
+
+    Returns
+    -------
+    int
+        The PID of the background process, or -1 if the shell invocation
+        exited with a non-zero status.
+    """
+    # Background the command and have the shell print the job's PID.
+    shell_line = f"nohup {command} > /dev/null 2>&1 & echo $!"
+    print(f"Executing: {shell_line}")
+    try:
+        completed = subprocess.run(
+            shell_line,
+            shell=True,
+            check=True,
+            executable='/bin/bash',
+            text=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing command: {command}\n{e}")
+        return -1
+
+    # The only thing written to stdout is the PID echoed by the shell.
+    pid = int(completed.stdout.strip())
+    print(f"Started process with PID: {pid}")
+    return pid
+
+
+def kill_process(pid: int) -> None:
+    """
+    Kills a single process by its PID.
+
+    Non-positive values are refused instead of being passed to ``os.kill``:
+    there, ``0`` addresses the caller's whole process group and ``-1``
+    addresses every process the caller is allowed to signal. ``-1`` is also
+    the value ``execute_cmd`` returns when it fails to start a command.
+
+    Parameters
+    ----------
+    pid : int
+        The PID of the process to be killed. Must be greater than zero.
+    """
+    if pid <= 0:
+        # Not a real process ID; signalling it would hit a group of processes.
+        print(f"Refusing to kill invalid PID: {pid}.")
+        return
+
+    try:
+        os.kill(pid, 9)  # Send SIGKILL to the process
+        print(f"Process {pid} killed successfully.")
+    except ProcessLookupError:
+        print(f"No process found with PID: {pid}.")
+    except PermissionError:
+        print(f"Permission denied to kill PID: {pid}.")
+
+
+def simple_flowcept_task(workflow_id):
+    """
+    Run one ``FlowceptTask`` inside a ``Flowcept`` context.
+
+    The ``Flowcept`` context is opened with ``start_persistence=False`` and
+    receives ``workflow_id`` as both its workflow ID and its bundle execution
+    ID. The task is created with ``used={"a": 1}`` and ended with
+    ``generated={"b": 2}``.
+
+    Parameters
+    ----------
+    workflow_id : str
+        Identifier passed as ``workflow_id`` and ``bundle_exec_id``.
+    """
+    # The task context is created only after the Flowcept context is entered,
+    # exactly as with two nested ``with`` statements.
+    with Flowcept(
+        start_persistence=False,
+        workflow_id=workflow_id,
+        bundle_exec_id=workflow_id,
+    ), FlowceptTask(used={"a": 1}) as t:
+        t.end(generated={"b": 2})
+
+
+if __name__ == "__main__":
+    # Starts the consumption services in a background process, emits one task
+    # from this process, then checks that a query for this workflow_id returns
+    # exactly that one task.
+
+    workflow_id = str(uuid.uuid4())
+    print(workflow_id)
+
+    pid = execute_cmd(f"python -c 'from flowcept import Flowcept; Flowcept.start_consumption_services(\"{workflow_id}\")'")
+    if pid <= 0:
+        # execute_cmd reports failure as -1. Passing that on to os.kill would
+        # signal every process this user may signal, so stop right here.
+        raise RuntimeError("Could not start the consumption services process.")
+
+    try:
+        sleep(1)
+
+        simple_flowcept_task(workflow_id)
+
+        sleep(15)  # Give enough time for the consumer services to do their thing
+    finally:
+        # Stop the background consumer even if the task or a sleep raised, so
+        # a failed run does not leave it behind.
+        kill_process(pid)
+
+    tasks = Flowcept.db.query({"workflow_id": workflow_id})
+    assert len(tasks) == 1
+    print(tasks)