Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(sandbox): adding docker sandbox #1517

Merged
merged 19 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
fdcfb87
feat(sandbox): adding docker sandbox
ArslanSaleem Jan 14, 2025
bd18ebf
Merge branch 'release/v3' into feat/sandbox
ArslanSaleem Jan 16, 2025
3c72f5a
feat(sandbox): add serializer and deserializer
ArslanSaleem Jan 20, 2025
f0b5c87
Merge branch 'release/v3' into feat/sandbox
ArslanSaleem Jan 20, 2025
b630742
feat(dockerSandbox): complete implementation of docker sandbox
ArslanSaleem Jan 21, 2025
f880b01
fix(sandbox): spell mistakes
ArslanSaleem Jan 21, 2025
221c1dc
fix(sandbox): fix readme command
ArslanSaleem Jan 21, 2025
a341e07
fix(langchan): poetry lock file update
ArslanSaleem Jan 21, 2025
52d0489
Update pandasai/sandbox/sandbox.py
ArslanSaleem Jan 21, 2025
3ed84fe
fix(docker): make docker not use network
ArslanSaleem Jan 21, 2025
d0dc4c0
Merge branch 'feat/sandbox' of https://github.com/gventuri/pandas-ai …
ArslanSaleem Jan 21, 2025
afb593f
Merge branch 'release/v3' into feat/sandbox
ArslanSaleem Jan 21, 2025
a78039d
fix(ruff): errors in code formatting
ArslanSaleem Jan 21, 2025
5d99577
feat(sandbox): add notebook for docker sandbox
ArslanSaleem Jan 21, 2025
1806976
added notebook and documentation for sandbox
gdcsinaptik Jan 22, 2025
878f325
feat(Sandbox): typo and update documentation
ArslanSaleem Jan 24, 2025
7c18a8c
fix(docker): create docker from command line
ArslanSaleem Jan 24, 2025
bf1bf4f
Update extensions/sandbox/docker/pandasai_docker/docker_sandbox.py
ArslanSaleem Jan 24, 2025
79f71f6
Update extensions/sandbox/docker/pandasai_docker/docker_sandbox.py
ArslanSaleem Jan 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions examples/docker_sandbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Install the PandasAI Docker sandbox extension with:
#   pip install pandasai-docker
from pandasai_docker import DockerSandbox
from pandasai_openai.openai import OpenAI

import pandasai as pai
from pandasai import Agent

llm = OpenAI(api_token="sk-*******")

pai.config.set({"llm": llm})

df = pai.read_csv("/Users/arslan/Documents/SinapTik/pandas-ai/artists.csv")


# Use the default sandbox image (built from the Dockerfile bundled with the extension)
sandbox = DockerSandbox()
agent = Agent([df], memory_size=10, sandbox=sandbox)
# Chat with the Agent
response = agent.chat("plot top five artists streams")

# Destroy the container after usage, or let the class destructor destroy it.
sandbox.stop()


# Use a custom docker image
sandbox = DockerSandbox("pandaai-sandbox", "/path/to/Dockerfile")
agent = Agent([df], memory_size=10, sandbox=sandbox)
# Chat with the Agent
response = agent.chat("plot top five artists streams")

# Destroy the container after usage, or let the class destructor destroy it.
sandbox.stop()
9 changes: 9 additions & 0 deletions extensions/sandbox/docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Docker Sandbox Extension for PandasAI

## Installation

You can install this extension using poetry:

```bash
poetry add pandasai-docker
gventuri marked this conversation as resolved.
Show resolved Hide resolved
```
12 changes: 12 additions & 0 deletions extensions/sandbox/docker/pandasai_docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
FROM python:3.9

LABEL image_name="pandaai-sandbox"
gventuri marked this conversation as resolved.
Show resolved Hide resolved

# Install required Python packages
# (the serializer helper that DockerSandbox prepends to submitted code imports pandas and numpy)
RUN pip install pandas numpy matplotlib

# Set the working directory inside the container
WORKDIR /app

# Default command keeps the container running (useful for testing or debugging);
# DockerSandbox.start() also passes "sleep infinity" explicitly when it runs the image
CMD ["sleep", "infinity"]
3 changes: 3 additions & 0 deletions extensions/sandbox/docker/pandasai_docker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .docker_sandbox import DockerSandbox

__all__ = ["DockerSandbox"]
188 changes: 188 additions & 0 deletions extensions/sandbox/docker/pandasai_docker/docker_sandbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import io
import logging
import os
import re
import tarfile
import uuid
from typing import Optional

import docker

from pandasai.sandbox import Sandbox

from .serializer import ResponseSerializer

logger = logging.getLogger(__name__)


class DockerSandbox(Sandbox):
    """Sandbox that runs generated Python code inside a Docker container.

    The image is built on construction if it does not exist yet. SQL queries
    found in the submitted code are executed on the host, their results are
    copied into the container as CSV files, and the code is then run with
    ``python -c`` inside the container. The result is printed as JSON by the
    helper code read from ``serializer.py`` and decoded on the host.
    """

    # Quoted string literal (single or double quotes) ending in ``.png``.
    _PNG_LITERAL = re.compile(r"""(['"])([^'"]*\.png)\1""")
    # Temporary location inside the container where charts are written.
    _CONTAINER_CHART_PATH = "/tmp/temp_chart.png"

    def __init__(self, image_name="pandaai-sandbox", dockerfile_path=None):
        """Create the sandbox and make sure its Docker image exists.

        Args:
            image_name (str): Tag of the Docker image to run.
            dockerfile_path (str | None): Dockerfile used to build the image
                when it is missing; defaults to the Dockerfile next to this
                module.
        """
        super().__init__()
        # Assigned before anything that can raise so that __del__ is always safe.
        self._container: Optional[docker.models.containers.Container] = None
        self._dockerfile_path: str = dockerfile_path or os.path.join(
            os.path.dirname(__file__), "Dockerfile"
        )
        self._image_name: str = image_name
        self._client: docker.DockerClient = docker.from_env()

        # Build the image if it does not exist
        if not self._image_exists():
            self._build_image()

        # Source of serializer.py, prepended to every piece of code sent to
        # the container so that ``parser`` is defined there.
        self._helper_code: str = self._read_start_code(
            os.path.join(os.path.dirname(__file__), "serializer.py")
        )

    def _image_exists(self) -> bool:
        """Return True when an image tagged ``self._image_name`` is available."""
        try:
            self._client.images.get(self._image_name)
        except docker.errors.ImageNotFound:
            return False
        return True

    def _build_image(self) -> None:
        """Build the sandbox image from ``self._dockerfile_path``."""
        logger.info(
            "Building Docker image '%s' from '%s'...",
            self._image_name,
            self._dockerfile_path,
        )
        with open(self._dockerfile_path, "rb") as file:
            self._client.images.build(fileobj=file, tag=self._image_name)

    def start(self):
        """Start a detached container from the image unless already started."""
        if self._started:
            return

        logger.info(
            "Starting a Docker container from the image '%s'", self._image_name
        )
        self._container = self._client.containers.run(
            self._image_name, command="sleep infinity", detach=True, tty=True
        )
        logger.info(
            "Started a Docker container with id '%s' from the image '%s'",
            self._container.id,
            self._image_name,
        )
        self._started = True

    def stop(self) -> None:
        """Stop and remove the running container, if any."""
        if self._started and self._container:
            logger.info(
                "Stopping a Docker container with id '%s'", self._container.id
            )
            self._container.stop()
            self._container.remove()
            self._container = None
            self._started = False

    def _read_start_code(self, file_path: str) -> str:
        """Read helper start code from a file as a string.

        Args:
            file_path (str): Path to the file.

        Returns:
            str: Code as a string.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()

    def _exec_code(self, code: str, enviroment: dict) -> dict:
        """Execute Python code in a Docker container.

        Args:
            code (str): Code to execute.
            enviroment (dict): Execution environment; must provide an
                ``execute_sql_query`` callable when the code contains SQL
                queries.

        Returns:
            dict: Result of the code execution.

        Raises:
            RuntimeError: If the container is not running, if
                ``execute_sql_query`` is required but missing, or if the code
                exits with a non-zero status inside the container.
        """
        if not self._container:
            raise RuntimeError("Container is not running.")

        sql_queries = self._extract_sql_queries_from_code(code)

        # Remember the first chart path the code refers to (either quote
        # style), then point every .png literal at the temporary path inside
        # the container. The same pattern is used for both steps so that the
        # detected path and the rewritten literals always agree.
        first_png = self._PNG_LITERAL.search(code)
        original_chart_path = first_png.group(2) if first_png else None
        chart_path = self._CONTAINER_CHART_PATH
        code = self._PNG_LITERAL.sub(
            lambda m: f"{m.group(1)}{chart_path}{m.group(1)}",
            code,
        )

        # Execute SQL queries, save the query results to CSV files
        datasets_map = {}
        execute_sql_query_func = enviroment.get("execute_sql_query")
        for sql_query in sql_queries:
            if execute_sql_query_func is None:
                raise RuntimeError(
                    "execute_sql_query function is not defined in the environment."
                )

            query_df = execute_sql_query_func(sql_query)
            filename = f"{uuid.uuid4().hex}.csv"
            # Pass the files to the container for further processing
            self.pass_csv(query_df, filename=filename)
            datasets_map[sql_query] = filename

        # Add the datasets_map variable to the code; inside the container
        # execute_sql_query just loads the CSV produced above for that query.
        dataset_map = f"""
datasets_map = {datasets_map}

def execute_sql_query(sql_query):
    filename = datasets_map[sql_query]
    filepath = os.path.join("/tmp", filename)
    return pd.read_csv(filepath)

"""
        # serialization code to get output from docker
        end_code = """
print(parser.serialize(result))
"""
        # Concatenate code and helper code
        code = self._helper_code + dataset_map + code + end_code

        # Compile the code for errors
        self._compile_code(code)

        logger.info("Submitting code to docker container %s", code)

        # Pass the command as an argv list so the code reaches ``python -c``
        # verbatim; a single command string would be split shell-style and
        # would mangle backslashes and quotes in the generated code.
        exit_code, output = self._container.exec_run(
            cmd=["python", "-c", code], demux=True
        )
        # With demux=True each stream is None when nothing was written to it.
        stdout, stderr = output

        if exit_code != 0:
            error_text = (stderr or b"").decode()
            raise RuntimeError(f"Error executing code: {error_text}")

        response = (stdout or b"").decode()
        return ResponseSerializer.deserialize(response, original_chart_path)

    def pass_csv(self, csv_data, filename="file.csv") -> None:
        """Copy a DataFrame into the container as ``/tmp/<filename>``.

        Args:
            csv_data: Object exposing ``to_csv(index=False)`` (a DataFrame).
            filename (str): Name of the CSV file created under ``/tmp``.

        Raises:
            RuntimeError: If the container is not running.
        """
        if not self._container:
            raise RuntimeError("Container is not running.")

        # Convert the DataFrame to CSV bytes
        csv_bytes = csv_data.to_csv(index=False).encode("utf-8")

        # Create a tar archive in memory holding the CSV as a single file
        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            tarinfo = tarfile.TarInfo(name=filename)
            tarinfo.size = len(csv_bytes)
            tar.addfile(tarinfo, io.BytesIO(csv_bytes))

        # Seek to the beginning of the stream
        tar_stream.seek(0)

        # Transfer the tar archive to the container
        self._container.put_archive("/tmp", tar_stream)

    def __del__(self) -> None:
        """Stop and remove the container when the sandbox is garbage collected."""
        # getattr guards against __init__ having failed before the attribute
        # was assigned.
        container = getattr(self, "_container", None)
        if container:
            container.stop()
            container.remove()
73 changes: 73 additions & 0 deletions extensions/sandbox/docker/pandasai_docker/serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import base64
import datetime
import json
import os # important to import
gventuri marked this conversation as resolved.
Show resolved Hide resolved
import tarfile # important to import
from json import JSONEncoder

import numpy as np
import pandas as pd


class ResponseSerializer:
    """Convert execution results to and from their JSON transport form."""

    @staticmethod
    def serialize_dataframe(df: pd.DataFrame) -> dict:
        """Return *df* as a split-oriented dict; an empty frame maps to empty lists."""
        return (
            {"columns": [], "data": [], "index": []}
            if df.empty
            else df.to_dict(orient="split")
        )

    @staticmethod
    def serialize(result: dict) -> str:
        """Encode *result* as a JSON string, updating its value entry in place.

        Dataframe results (a Series is first turned into a frame) become
        split-oriented dicts; plot results whose value is a file path become
        the base64 text of that file. Other results are dumped unchanged.
        """
        kind = result["type"]

        if kind == "dataframe":
            frame = result["value"]
            if isinstance(frame, pd.Series):
                frame = frame.to_frame()
                result["value"] = frame
            result["value"] = ResponseSerializer.serialize_dataframe(frame)

        elif kind == "plot" and isinstance(result["value"], str):
            with open(result["value"], "rb") as image_file:
                raw_image = image_file.read()
                result["value"] = base64.b64encode(raw_image).decode()

        return json.dumps(result, cls=CustomEncoder)

    @staticmethod
    def deserialize(response: str, chart_path: str = None) -> dict:
        """Decode a JSON *response* produced by ``serialize``.

        Dataframe values are rebuilt as ``pd.DataFrame``. For plot results,
        when *chart_path* is given, the base64 image is written to that path
        and the value is replaced by the path.
        """
        result = json.loads(response)
        kind = result["type"]

        if kind == "dataframe":
            table = result["value"]
            result["value"] = pd.DataFrame(
                data=table["data"],
                index=table["index"],
                columns=table["columns"],
            )
            return result

        if kind == "plot" and chart_path:
            decoded_image = base64.b64decode(result["value"])

            # Write the binary data to a file
            with open(chart_path, "wb") as image_file:
                image_file.write(decoded_image)

            result["value"] = chart_path

        return result


class CustomEncoder(JSONEncoder):
    """JSON encoder for NumPy, pandas and datetime values found in results."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        NumPy booleans, integers and floats become Python scalars, arrays
        become nested lists, timestamps/datetimes/dates become ISO 8601
        strings and DataFrames become split-oriented dicts. Any other type is
        passed to the base class, which raises ``TypeError``.
        """
        # np.bool_ is not an np.integer subclass, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)

        # np.integer / np.floating already cover every sized variant
        # (int64, float64, ...), so the concrete types need not be listed.
        if isinstance(obj, np.integer):
            return int(obj)

        if isinstance(obj, np.floating):
            return float(obj)

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        if isinstance(obj, (pd.Timestamp, datetime.datetime, datetime.date)):
            return obj.isoformat()

        if isinstance(obj, pd.DataFrame):
            return ResponseSerializer.serialize_dataframe(obj)

        return super().default(obj)


# Module-level serializer instance. The Docker sandbox prepends this file's
# source to the code it runs and appends ``print(parser.serialize(result))``,
# so the name ``parser`` must be defined here.
parser = ResponseSerializer()
Loading
Loading