stitionai · erkinalp · Dec 18, 2024 · Dec 18, 2024
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -90,10 +90,11 @@ Devika's cognitive abilities are powered by a collection of specialized sub-agen
 - Provides a human-like confirmation of the action to the user
 
 ### Runner
-- Executes the written code in a sandboxed environment 
+- Executes code in a secure sandboxed environment using firejail
+- Validates code for security concerns before execution
+- Restricts dangerous imports and function calls
+- Prevents network access and filesystem access outside sandbox
 - Handles different OS environments (Mac, Linux, Windows)
-- Streams command output to user in real-time
-- Gracefully handles errors and exceptions
 
 ### Feature
 - Implements a new feature based on user's specification

diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 
 ![devika screenshot](.assets/devika-screenshot.png)
 
-> [!IMPORTANT]  
+> [!IMPORTANT]
 > This project is currently in a very early development/experimental stage. There are a lot of unimplemented/broken features at the moment. Contributions are welcome to help out with the progress!
 
 ## Table of Contents
@@ -61,10 +61,12 @@ Version's requirements
   - Python >= 3.10 and < 3.12
   - NodeJs >= 18
   - bun
+  - firejail (for secure code execution)
 ```
 
 - Install uv - Python Package manager [download](https://github.com/astral-sh/uv)
 - Install bun - JavaScript runtime [download](https://bun.sh/docs/installation)
+- Install firejail - Security sandbox, Linux only [source](https://github.com/netblue30/firejail) (Debian/Ubuntu: `sudo apt-get install firejail`)
 - For ollama [ollama setup guide](docs/Installation/ollama.md) (optinal: if you don't want to use the local models then you can skip this step)
 - For API models, configure the API keys via setting page in UI.
 
@@ -84,7 +86,7 @@ To install Devika, follow these steps:
 3. Create a virtual environment and install the required dependencies (you can use any virtual environment manager):
    ```bash
    uv venv
-   
+
    # On macOS and Linux.
    source .venv/bin/activate
 

diff --git a/devika.py b/devika.py
@@ -23,14 +23,15 @@
 from src.state import AgentState
 from src.agents import Agent
 from src.llm import LLM
+from src.sandbox.code_runner import CodeRunner
 
 
 app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": # Change the origin to your frontend URL
                              [
                                  "https://localhost:3000",
                                  "http://localhost:3000",
-                                 ]}}) 
+                                 ]}})
 app.register_blueprint(project_bp)
 socketio.init_app(app)
 
@@ -157,8 +158,14 @@ def run_code():
     data = request.json
     project_name = data.get("project_name")
     code = data.get("code")
-    # TODO: Implement code execution logic
-    return jsonify({"message": "Code execution started"})
+
+    if not code:
+        return jsonify({"success": False, "error": "No code provided"}), 400
+
+    runner = CodeRunner()
+    result = runner.run(code)
+
+    return jsonify(result)
 
 
 @app.route("/api/calculate-tokens", methods=["POST"])

diff --git a/src/agents/runner/runner.py b/src/agents/runner/runner.py
@@ -11,6 +11,7 @@
 from src.state import AgentState
 from src.project import ProjectManager
 from src.services.utils import retry_wrapper, validate_responses
+from src.sandbox.code_runner import CodeRunner
 
 PROMPT = open("src/agents/runner/prompt.jinja2", "r").read().strip()
 RERUNNER_PROMPT = open("src/agents/runner/rerunner.jinja2", "r").read().strip()
@@ -58,7 +59,7 @@ def validate_response(self, response: str):
             return False
         else:
             return response["commands"]
-    
+
     @validate_responses
     def validate_rerunner_response(self, response: str):
         if "action" not in response and "response" not in response:
@@ -75,30 +76,27 @@ def run_code(
         conversation: list,
         code_markdown: str,
         system_os: str
-    ):  
+    ):
         retries = 0
-
+        runner = CodeRunner()
+
         for command in commands:
             command_set = command.split(" ")
             command_failed = False
-
-            process = subprocess.run(
-                command_set,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                cwd=project_path
-            )
-            command_output = process.stdout.decode('utf-8')
-            command_failed = process.returncode != 0
-
+
+            # Run command in sandbox
+            result = runner.run(command)
+            command_output = result["output"]
+            command_failed = not result["success"]
+
             new_state = AgentState().new_state()
             new_state["internal_monologue"] = "Running code..."
             new_state["terminal_session"]["title"] = "Terminal"
             new_state["terminal_session"]["command"] = command
             new_state["terminal_session"]["output"] = command_output
             AgentState().add_to_current_state(project_name, new_state)
             time.sleep(1)
-            
+
             while command_failed and retries < 2:
                 new_state = AgentState().new_state()
                 new_state["internal_monologue"] = "Oh seems like there is some error... :("
@@ -107,59 +105,55 @@ def run_code(
                 new_state["terminal_session"]["output"] = command_output
                 AgentState().add_to_current_state(project_name, new_state)
                 time.sleep(1)
-                
+
                 prompt = self.render_rerunner(
                     conversation=conversation,
                     code_markdown=code_markdown,
                     system_os=system_os,
                     commands=commands,
                     error=command_output
                 )
-                
+
                 response = self.llm.inference(prompt, project_name)
-                
+
                 valid_response = self.validate_rerunner_response(response)
-                
+
                 if not valid_response:
                     return False
-                
+
                 action = valid_response["action"]
-                
+
                 if action == "command":
                     command = valid_response["command"]
                     response = valid_response["response"]
-                    
+
                     ProjectManager().add_message_from_devika(project_name, response)
-                    
+
                     command_set = command.split(" ")
                     command_failed = False
-
-                    process = subprocess.run(
-                        command_set,
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        cwd=project_path
-                    )
-                    command_output = process.stdout.decode('utf-8')
-                    command_failed = process.returncode != 0
-
+
+                    # Run command in sandbox
+                    result = runner.run(command)
+                    command_output = result["output"]
+                    command_failed = not result["success"]
+
                     new_state = AgentState().new_state()
                     new_state["internal_monologue"] = "Running code..."
                     new_state["terminal_session"]["title"] = "Terminal"
                     new_state["terminal_session"]["command"] = command
                     new_state["terminal_session"]["output"] = command_output
                     AgentState().add_to_current_state(project_name, new_state)
                     time.sleep(1)
-                    
+
                     if command_failed:
                         retries += 1
                     else:
                         break
                 elif action == "patch":
                     response = valid_response["response"]
-                    
+
                     ProjectManager().add_message_from_devika(project_name, response)
-                    
+
                     code = Patcher(base_model=self.base_model).execute(
                         conversation=conversation,
                         code_markdown=code_markdown,
@@ -168,29 +162,25 @@ def run_code(
                         system_os=system_os,
                         project_name=project_name
                     )
-                    
+
                     Patcher(base_model=self.base_model).save_code_to_project(code, project_name)
-                    
+
                     command_set = command.split(" ")
                     command_failed = False
-
-                    process = subprocess.run(
-                        command_set,
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        cwd=project_path
-                    )
-                    command_output = process.stdout.decode('utf-8')
-                    command_failed = process.returncode != 0
-
+
+                    # Run command in sandbox
+                    result = runner.run(command)
+                    command_output = result["output"]
+                    command_failed = not result["success"]
+
                     new_state = AgentState().new_state()
                     new_state["internal_monologue"] = "Running code..."
                     new_state["terminal_session"]["title"] = "Terminal"
                     new_state["terminal_session"]["command"] = command
                     new_state["terminal_session"]["output"] = command_output
                     AgentState().add_to_current_state(project_name, new_state)
                     time.sleep(1)
-                    
+
                     if command_failed:
                         retries += 1
                     else:
@@ -207,9 +197,9 @@ def execute(
     ) -> str:
         prompt = self.render(conversation, code_markdown, os_system)
         response = self.llm.inference(prompt, project_name)
-        
+
         valid_response = self.validate_response(response)
-        
+
         self.run_code(
             valid_response,
             project_path,
@@ -219,4 +209,4 @@ def execute(
             os_system
         )
 
-        return valid_response
+        return valid_response
diff --git a/src/sandbox/code_runner.py b/src/sandbox/code_runner.py
@@ -0,0 +1,75 @@
+"""
+Code execution manager with security restrictions.
+"""
+from typing import Dict, Optional, Tuple
+import os
+import re
+from .firejail import Sandbox
+
+class CodeRunner:
+    """
+    Manages secure code execution with restrictions and validation.
+
+    Validation is a best-effort textual blocklist, not a security boundary.
+    """
+
+    # Names rejected when they start an identifier (prefix match on purpose,
+    # so e.g. urllib3 / socketserver are rejected as well).
+    RESTRICTED_IMPORTS = {
+        'os.system', 'subprocess', 'pty', 'socket', 'requests',
+        'urllib', 'ftplib', 'telnetlib', 'smtplib'
+    }
+
+    # Call patterns, searched unanchored (so open\s*\( also matches popen( ).
+    RESTRICTED_CALLS = {
+        r'eval\s*\(', r'exec\s*\(', r'open\s*\(',
+        r'__import__\s*\(', r'globals\s*\(', r'locals\s*\('
+    }
+
+    def __init__(self):
+        self.sandbox = Sandbox()
+
+    def validate_code(self, code: str) -> Tuple[bool, str]:
+        """
+        Validate code for security concerns.
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        # sorted(): set order varies per process, so pin the reported match.
+        # The lookbehind stops words like "empty" from tripping the "pty" entry.
+        for imp in sorted(self.RESTRICTED_IMPORTS):
+            if re.search(r'(?<!\w)' + re.escape(imp), code):
+                return False, f"Use of restricted import: {imp}"
+
+        for call in sorted(self.RESTRICTED_CALLS):
+            if re.search(call, code):
+                return False, f"Use of restricted function call pattern: {call}"
+
+        return True, ""
+
+    def run(self, code: str, timeout: int = 30) -> Dict[str, object]:
+        """
+        Run code securely with validation and sandboxing.
+
+        Args:
+            code: The Python code to execute
+            timeout: Maximum execution time in seconds
+
+        Returns:
+            Dict with "success" (bool), "output" (stdout) and "error" (str)
+        """
+        # Reject None / non-str / empty input here instead of raising TypeError.
+        if not isinstance(code, str) or not code:
+            return {"success": False, "error": "No code provided", "output": ""}
+
+        is_valid, error = self.validate_code(code)
+        if not is_valid:
+            return {"success": False, "error": error, "output": ""}
+
+        stdout, stderr, return_code = self.sandbox.run_code(code, timeout)
+        return {
+            "success": return_code == 0,
+            "output": stdout,
+            "error": stderr if stderr else "",
+        }