Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(security): Implement sandboxed code execution #664

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,11 @@ Devika's cognitive abilities are powered by a collection of specialized sub-agen
- Provides a human-like confirmation of the action to the user

### Runner
- Executes the written code in a sandboxed environment
- Executes code in a secure sandboxed environment using firejail
- Validates code for security concerns before execution
- Restricts dangerous imports and function calls
- Prevents network access and filesystem access outside sandbox
- Handles different OS environments (Mac, Linux, Windows)
- Streams command output to user in real-time
- Gracefully handles errors and exceptions

### Feature
- Implements a new feature based on user's specification
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

![devika screenshot](.assets/devika-screenshot.png)

> [!IMPORTANT]
> [!IMPORTANT]
> This project is currently in a very early development/experimental stage. There are a lot of unimplemented/broken features at the moment. Contributions are welcome to help out with the progress!

## Table of Contents
Expand Down Expand Up @@ -61,10 +61,12 @@ Version's requirements
- Python >= 3.10 and < 3.12
- NodeJs >= 18
- bun
- firejail (for secure code execution)
```

- Install uv - Python Package manager [download](https://github.com/astral-sh/uv)
- Install bun - JavaScript runtime [download](https://bun.sh/docs/installation)
- Install firejail - Security sandbox [install with `sudo apt-get install firejail`]
- For ollama [ollama setup guide](docs/Installation/ollama.md) (optional: if you don't want to use the local models then you can skip this step)
- For API models, configure the API keys via setting page in UI.

Expand All @@ -84,7 +86,7 @@ To install Devika, follow these steps:
3. Create a virtual environment and install the required dependencies (you can use any virtual environment manager):
```bash
uv venv

# On macOS and Linux.
source .venv/bin/activate

Expand Down
13 changes: 10 additions & 3 deletions devika.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
from src.state import AgentState
from src.agents import Agent
from src.llm import LLM
from src.sandbox.code_runner import CodeRunner


app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": # Change the origin to your frontend URL
[
"https://localhost:3000",
"http://localhost:3000",
]}})
]}})
app.register_blueprint(project_bp)
socketio.init_app(app)

Expand Down Expand Up @@ -157,8 +158,14 @@ def run_code():
data = request.json
project_name = data.get("project_name")
code = data.get("code")
# TODO: Implement code execution logic
return jsonify({"message": "Code execution started"})

if not code:
return jsonify({"success": False, "error": "No code provided"}), 400

runner = CodeRunner()
result = runner.run(code)

return jsonify(result)


@app.route("/api/calculate-tokens", methods=["POST"])
Expand Down
92 changes: 41 additions & 51 deletions src/agents/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from src.state import AgentState
from src.project import ProjectManager
from src.services.utils import retry_wrapper, validate_responses
from src.sandbox.code_runner import CodeRunner

PROMPT = open("src/agents/runner/prompt.jinja2", "r").read().strip()
RERUNNER_PROMPT = open("src/agents/runner/rerunner.jinja2", "r").read().strip()
Expand Down Expand Up @@ -58,7 +59,7 @@ def validate_response(self, response: str):
return False
else:
return response["commands"]

@validate_responses
def validate_rerunner_response(self, response: str):
if "action" not in response and "response" not in response:
Expand All @@ -75,30 +76,27 @@ def run_code(
conversation: list,
code_markdown: str,
system_os: str
):
):
retries = 0

runner = CodeRunner()

for command in commands:
command_set = command.split(" ")
command_failed = False

process = subprocess.run(
command_set,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=project_path
)
command_output = process.stdout.decode('utf-8')
command_failed = process.returncode != 0


# Run command in sandbox
result = runner.run(command)
command_output = result["output"]
command_failed = not result["success"]

new_state = AgentState().new_state()
new_state["internal_monologue"] = "Running code..."
new_state["terminal_session"]["title"] = "Terminal"
new_state["terminal_session"]["command"] = command
new_state["terminal_session"]["output"] = command_output
AgentState().add_to_current_state(project_name, new_state)
time.sleep(1)

while command_failed and retries < 2:
new_state = AgentState().new_state()
new_state["internal_monologue"] = "Oh seems like there is some error... :("
Expand All @@ -107,59 +105,55 @@ def run_code(
new_state["terminal_session"]["output"] = command_output
AgentState().add_to_current_state(project_name, new_state)
time.sleep(1)

prompt = self.render_rerunner(
conversation=conversation,
code_markdown=code_markdown,
system_os=system_os,
commands=commands,
error=command_output
)

response = self.llm.inference(prompt, project_name)

valid_response = self.validate_rerunner_response(response)

if not valid_response:
return False

action = valid_response["action"]

if action == "command":
command = valid_response["command"]
response = valid_response["response"]

ProjectManager().add_message_from_devika(project_name, response)

command_set = command.split(" ")
command_failed = False

process = subprocess.run(
command_set,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=project_path
)
command_output = process.stdout.decode('utf-8')
command_failed = process.returncode != 0


# Run command in sandbox
result = runner.run(command)
command_output = result["output"]
command_failed = not result["success"]

new_state = AgentState().new_state()
new_state["internal_monologue"] = "Running code..."
new_state["terminal_session"]["title"] = "Terminal"
new_state["terminal_session"]["command"] = command
new_state["terminal_session"]["output"] = command_output
AgentState().add_to_current_state(project_name, new_state)
time.sleep(1)

if command_failed:
retries += 1
else:
break
elif action == "patch":
response = valid_response["response"]

ProjectManager().add_message_from_devika(project_name, response)

code = Patcher(base_model=self.base_model).execute(
conversation=conversation,
code_markdown=code_markdown,
Expand All @@ -168,29 +162,25 @@ def run_code(
system_os=system_os,
project_name=project_name
)

Patcher(base_model=self.base_model).save_code_to_project(code, project_name)

command_set = command.split(" ")
command_failed = False

process = subprocess.run(
command_set,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=project_path
)
command_output = process.stdout.decode('utf-8')
command_failed = process.returncode != 0


# Run command in sandbox
result = runner.run(command)
command_output = result["output"]
command_failed = not result["success"]

new_state = AgentState().new_state()
new_state["internal_monologue"] = "Running code..."
new_state["terminal_session"]["title"] = "Terminal"
new_state["terminal_session"]["command"] = command
new_state["terminal_session"]["output"] = command_output
AgentState().add_to_current_state(project_name, new_state)
time.sleep(1)

if command_failed:
retries += 1
else:
Expand All @@ -207,9 +197,9 @@ def execute(
) -> str:
prompt = self.render(conversation, code_markdown, os_system)
response = self.llm.inference(prompt, project_name)

valid_response = self.validate_response(response)

self.run_code(
valid_response,
project_path,
Expand All @@ -219,4 +209,4 @@ def execute(
os_system
)

return valid_response
return valid_response
75 changes: 75 additions & 0 deletions src/sandbox/code_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Code execution manager with security restrictions.
"""
from typing import Dict, Optional, Tuple
import os
import re
from .firejail import Sandbox

class CodeRunner:
    """
    Manages secure code execution with restrictions and validation.

    Submitted code is first screened by ``validate_code`` against two
    textual deny-lists and, only if it passes, handed to the ``Sandbox``
    instance created in ``__init__``.
    """

    # Restricted imports that could be dangerous.
    # Each entry is matched as a plain substring anywhere in the code.
    RESTRICTED_IMPORTS = {
        'os.system', 'subprocess', 'pty', 'socket', 'requests',
        'urllib', 'ftplib', 'telnetlib', 'smtplib'
    }

    # Restricted function calls, expressed as regular-expression fragments.
    RESTRICTED_CALLS = {
        r'eval\s*\(', r'exec\s*\(', r'open\s*\(',
        r'__import__\s*\(', r'globals\s*\(', r'locals\s*\('
    }

    def __init__(self):
        self.sandbox = Sandbox()

    def validate_code(self, code: str) -> Tuple[bool, str]:
        """
        Validate code for security concerns.

        Args:
            code: Source text to screen.

        Returns:
            Tuple of (is_valid, error_message). ``error_message`` is the
            empty string when ``is_valid`` is True.
        """
        # Callers may forward arbitrary JSON values; anything that is not
        # text cannot be screened, so reject it instead of raising.
        if not isinstance(code, str):
            return False, "Code must be a string"

        # Check for restricted imports. The deny-lists are sets, so iterate
        # in sorted order to report the same violation on every run.
        for imp in sorted(self.RESTRICTED_IMPORTS):
            if imp in code:
                return False, f"Use of restricted import: {imp}"

        # Check for restricted function calls. The lookbehind requires the
        # name to start at an identifier boundary, so identifiers that merely
        # end in a restricted name (``retrieval(``, ``is_open(``) are not
        # flagged, while ``eval(`` and ``obj.eval(`` still are.
        for call in sorted(self.RESTRICTED_CALLS):
            if re.search(r'(?<!\w)(?:' + call + r')', code):
                return False, f"Use of restricted function call pattern: {call}"

        return True, ""

    def run(self, code: str, timeout: int = 30) -> Dict[str, object]:
        """
        Run code securely with validation and sandboxing.

        Args:
            code: The Python code to execute
            timeout: Maximum execution time in seconds

        Returns:
            Dict with keys ``"success"`` (bool), ``"output"`` (sandbox
            stdout, or ``""`` when validation fails) and ``"error"``
            (validation message or sandbox stderr, ``""`` if none).
        """
        # Validate code; a rejected submission never reaches the sandbox.
        is_valid, error = self.validate_code(code)
        if not is_valid:
            return {
                "success": False,
                "error": error,
                "output": "",
            }

        # Run in sandbox
        stdout, stderr, return_code = self.sandbox.run_code(code, timeout)

        return {
            "success": return_code == 0,
            "output": stdout,
            "error": stderr if stderr else "",
        }
Loading