diff --git a/.gitmodules b/.gitmodules index 5b1be84..3647a21 100644 --- a/.gitmodules +++ b/.gitmodules @@ -40,3 +40,7 @@ [submodule "repos/pytorch-image-models"] path = repos/pytorch-image-models url = https://github.com/huggingface/pytorch-image-models.git +[submodule "evaluation/open_devin"] + path = evaluation/open_devin + url = https://github.com/OpenDevin/OpenDevin.git + diff --git a/Execution env/README.md b/Execution env/README.md deleted file mode 100644 index e3ef6ba..0000000 --- a/Execution env/README.md +++ /dev/null @@ -1,11 +0,0 @@ -You can evaluate your own model by downloading the following docker - -https://ml-bench-docker.s3.amazonaws.com/ml-bench-docker/ml_bench.tar - -then run - -`docker load -i ml_bench.tar` - -`docker run -d --name your_container_name -p host_port:container_port your_image_name` - -The .tar file has a README file for instructions, and it stores the results of our experiments. diff --git a/README.md b/README.md index 3d803d1..6ce3974 100644 --- a/README.md +++ b/README.md @@ -1,60 +1,33 @@ # ML-Bench: Large Language Models Leverage Open-source Libraries for Machine Learning Tasks

- πŸ“– Paper β€’ πŸš€ Github Page β€’ πŸ“Š Data + πŸ“– Paper β€’ πŸš€ Github Page β€’ πŸ“Š Data

![Alt text](https://github.com/gersteinlab/ML-Bench/blob/master/assets/image.png) -## Execution Env and data -We have annotated the ML-Bench with new data, filtered and modified it, and we will subsequently update it with the new execution environment and data. +## Docker Setup -The execution environment in old version (same version as arxiv paper 2311) can be found in ./Execution env, for data in old version please refer to https://drive.google.com/drive/folders/1e86FhLjxXK837SgR8a29cztx9UfxPQzS?usp=drive_link . +Please refer to [envs](envs/README.md) for details. -## GPT Calling +## OpenAI Calling -You can use the following script to reproduce GPT's performance on this task: -```python -sh script/GPT/run.sh -``` - -You need to change parameter settings in `script/GPT/run.sh` : - -* type: Choose from quarter or full. - -* model: Model name - -* input_file: File path of dataset - -* answer_file: Original answer json format from GPT. - -* parsing_file: Post-process the output of GPT in jsonl format to obtain executable code segments. - -* readme_type: Choose from oracle_segment and readme - - *# oracle_segment: The code paragraph in the readme that is most relevant to the task* - - *# readme: The entire text of the readme in the repository where the task is located* - -* engine_name: Choose from gpt-35-turbo-16k and gpt-4-32. - -* n_turn: GPT returns the number of executable codes (5 times in the paper experiment). +Please refer to [openai](scripts/openai/README.md) for details. -* openai_key: Your key. +## Open Source Model Fine-tuning -## CodeLlama-7b Fine-tuning -Please refer to [CodeLlama-7b](script/codellama/README.md) for details. +Please refer to [finetune](scripts/finetune/README.md) for details. ## Tools ### Get BM25 result -Run `python script/tools/bm25.py` to generate BM25 results for the instructions and readme. Ensure to update the original dataset `path` and output `path` which includes the BM25 results. 
+Run `python utils/bm25.py` to generate BM25 results for the instructions and readme. Make sure to update the original dataset `path` and output `path` which includes the BM25 results. ### Crawl README files from a GitHub repository -Run `python script/tools/crawl.py` to fetch readme files from a specific GitHub repository. You'll need to modify the `url` within the code to retrieve the desired readme files. +Run `python utils/crawl.py` to fetch readme files from a specific GitHub repository. You'll need to modify the `url` within the code to retrieve the desired readme files. ## Cite Us This project is inspired by some related projects. We would like to thank the authors for their contributions. If you find this project or dataset useful, please cite it: diff --git a/envs/README.md b/envs/README.md new file mode 100644 index 0000000..cbefc64 --- /dev/null +++ b/envs/README.md @@ -0,0 +1,14 @@ +# Environment Setup + +## ML-Agent-Bench Docker Setup + +To run the ML-Agent-Bench Docker container, you can use the following command: + +```bash +docker pull public.ecr.aws/i5g0m1f6/ml-bench +docker run -it public.ecr.aws/i5g0m1f6/ml-bench /bin/bash +``` + +This will pull the latest ML-Agent-Bench Docker image and run it in an interactive shell. The container includes all the necessary dependencies to run the ML-Agent-Bench codebase. + +For ML-Agent-Bench in OpenDevin, please refer to the [OpenDevin setup guide](https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/ml_bench/README.md). 
\ No newline at end of file diff --git a/evaluation/open_devin b/evaluation/open_devin new file mode 160000 index 0000000..35c4c9c --- /dev/null +++ b/evaluation/open_devin @@ -0,0 +1 @@ +Subproject commit 35c4c9cb3498668f7df094b255b1350a7802e1bc diff --git a/script/combine_files.sh b/scripts/combine_files.sh similarity index 100% rename from script/combine_files.sh rename to scripts/combine_files.sh diff --git a/script/codellama/README.md b/scripts/finetune/README.md similarity index 98% rename from script/codellama/README.md rename to scripts/finetune/README.md index f990383..904e39b 100644 --- a/script/codellama/README.md +++ b/scripts/finetune/README.md @@ -1,4 +1,4 @@ -# CodeLlama-7b Fine-tuning +# Open Source Model Fine-tuning ## Prerequisites Llama-recipes provides a pip distribution for easy install and usage in other projects. Alternatively, it can be installed from source. diff --git a/script/codellama/chat_completion.py b/scripts/finetune/chat_completion.py similarity index 100% rename from script/codellama/chat_completion.py rename to scripts/finetune/chat_completion.py diff --git a/script/codellama/finetuning.py b/scripts/finetune/finetuning.py similarity index 100% rename from script/codellama/finetuning.py rename to scripts/finetune/finetuning.py diff --git a/script/codellama/mlbench_dataset.py b/scripts/finetune/mlbench_dataset.py similarity index 100% rename from script/codellama/mlbench_dataset.py rename to scripts/finetune/mlbench_dataset.py diff --git a/script/generate/generate.py b/scripts/generate/generate.py similarity index 100% rename from script/generate/generate.py rename to scripts/generate/generate.py diff --git a/script/generate/generate.sh b/scripts/generate/generate.sh similarity index 100% rename from script/generate/generate.sh rename to scripts/generate/generate.sh diff --git a/scripts/openai/README.md b/scripts/openai/README.md new file mode 100644 index 0000000..9607595 --- /dev/null +++ b/scripts/openai/README.md @@ -0,0 +1,22 
@@ +# OpenAI Calling + +To reproduce OpenAI's performance on this task, use the following script: +```bash +bash scripts/openai/run.sh +``` + +## Parameter Settings + +You need to change the parameter settings in `scripts/openai/run.sh`: + +- `type`: Choose from `quarter` or `full`. +- `model`: Model name. +- `input_file`: File path of the dataset. +- `answer_file`: Original answer in JSON format from GPT. +- `parsing_file`: Post-process the output of GPT in JSONL format to obtain executable code segments. +- `readme_type`: Choose from `oracle_segment` and `readme`. + - `oracle_segment`: The code paragraph in the README that is most relevant to the task. + - `readme`: The entire text of the README in the repository where the task is located. +- `engine_name`: Choose from `gpt-35-turbo-16k` and `gpt-4-32`. +- `n_turn`: Number of executable codes GPT returns (5 times in the paper experiment). +- `openai_key`: Your OpenAI API key. \ No newline at end of file diff --git a/script/GPT/query_gpt.py b/scripts/openai/call_openai.py similarity index 100% rename from script/GPT/query_gpt.py rename to scripts/openai/call_openai.py diff --git a/script/GPT/run.sh b/scripts/openai/run.sh similarity index 96% rename from script/GPT/run.sh rename to scripts/openai/run.sh index e95544d..335ca75 100644 --- a/script/GPT/run.sh +++ b/scripts/openai/run.sh @@ -23,7 +23,7 @@ engine_name="gpt-4-32k" n_turn=1 -python query_gpt.py \ +python call_openai.py \ --readme_type ${readme_type} \ --instruction ${instructions} \ --nturn ${n_turn} \ diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/script/tools/bm25.py b/utils/bm25.py similarity index 100% rename from script/tools/bm25.py rename to utils/bm25.py diff --git a/utils/call_azure.py b/utils/call_azure.py new file mode 100644 index 0000000..b5ba317 --- /dev/null +++ b/utils/call_azure.py @@ -0,0 +1,47 @@ +import openai +import yaml +import os + + +def 
call_GPT(function_prompt,model_name,function_type,function): + if function_type == "auto": + with open("./config/config_azure.yml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + openai.api_base = config["api_base"] + openai.api_type = config["api_type"] + openai.api_version = config["api_version"] + #openai.api_proxy = config["api_proxy"] + openai.api_key = config["openai_keys"][model_name][0]["api_key"] + try: + res = openai.ChatCompletion.create( + engine=model_name, + messages=[ + {"role": "user", + "content": function_prompt} + ], + functions = [function], + function_call = "auto" , + ) + return res + except Exception as e: + print("An exception occurred:", e) + elif function_type == "none": + with open("./config/config_azure.yml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + openai.api_base = config["api_base"] + openai.api_type = config["api_type"] + openai.api_version = config["api_version"] + #openai.api_proxy = config["api_proxy"] + openai.api_key = config["openai_keys"][model_name][0]["api_key"] + try: + res = openai.ChatCompletion.create( + engine=model_name, + messages=[ + {"role": "user", + "content": function_prompt} + ] + ) + return res + except Exception as e: + print("An exception occurred:", e) + diff --git a/utils/call_openai.py b/utils/call_openai.py new file mode 100644 index 0000000..5c61f87 --- /dev/null +++ b/utils/call_openai.py @@ -0,0 +1,42 @@ +import openai +import yaml +import os + +def call_GPT(function_prompt,model_name,function_type,function): + if function_type == "auto": + with open("./config/config_openai.yml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + openai.api_base = config["api_base"] + openai.api_proxy = config["api_proxy"] + openai.api_key = config["openai_keys"][model_name][0]["api_key"] + try: + res = openai.ChatCompletion.create( + model = model_name, + messages = [ + {"role": "user", + "content": function_prompt} + ], + functions = [function], + function_call = "auto" , + ) + 
return res + except Exception as e: + print("An exception occurred:", e) + elif function_type == "none": + with open("./config/config_openai.yml", "r") as yaml_file: + config = yaml.safe_load(yaml_file) + openai.api_base = config["api_base"] + #openai.api_proxy = config["api_proxy"] + openai.api_key = config["openai_keys"][model_name][0]["api_key"] + try: + res = openai.ChatCompletion.create( + model=model_name, + messages=[ + {"role": "user", + "content": function_prompt} + ] + ) + return res + except Exception as e: + print("An exception occurred:", e) + diff --git a/script/tools/crawl.py b/utils/crawl.py similarity index 100% rename from script/tools/crawl.py rename to utils/crawl.py diff --git a/utils/generate_index.py b/utils/generate_index.py new file mode 100644 index 0000000..11c1b16 --- /dev/null +++ b/utils/generate_index.py @@ -0,0 +1,10 @@ +import os + +def write_indexfile(repo_name,root_directory): + directory_path = root_directory + all_contents = [os.path.relpath(os.path.join(root, item), directory_path) for root, _, items in os.walk(directory_path) for item in items] + output_file = 'directory_contents.txt' + with open(repo_name+"_index.txt", 'w') as file: + file.write('\n'.join(all_contents) + '\n') + print(f'All directory contents written') + diff --git a/utils/get_args.py b/utils/get_args.py new file mode 100644 index 0000000..3803847 --- /dev/null +++ b/utils/get_args.py @@ -0,0 +1,12 @@ +import argparse + +def Get_args(): + parser = argparse.ArgumentParser(description="Please choose a model,api_type,function_call to use agent.") + parser.add_argument('--model_name', type=str, required=True, help="Model name") + parser.add_argument('--api_type', type=str, required=True, help="Api type") + parser.add_argument('--function_type', type=str, required=True, help="Function type:auto or none") + args = parser.parse_args() + model_name = args.model_name + api_type =args.api_type + function_type = args.function_type + return 
model_name,api_type,function_type diff --git a/utils/keywords.py b/utils/keywords.py new file mode 100644 index 0000000..f349d8c --- /dev/null +++ b/utils/keywords.py @@ -0,0 +1,22 @@ +import re +from tools.read_yml import read_yaml_file +import json + +def get_keywords(query,model_name,api_type,function_type): + if api_type == "openai": + from tools.call_openai import call_GPT + elif api_type == "azure": + from tools.call_azure import call_GPT + function_file = "./functions/step1_function.yml" + function_prompt, function = read_yaml_file(function_file) + function_prompt = function_prompt.format(query) + response = call_GPT(function_prompt,model_name,function_type,function) + print(response) + function_call_message = response["choices"][0]["message"]["function_call"] + function_call_json = json.loads(json.dumps(function_call_message.to_dict())) + res_keywords = json.loads(function_call_json["arguments"])["keywords"] + keywords = res_keywords.split(', ') + return keywords + + + diff --git a/utils/read_yml.py b/utils/read_yml.py new file mode 100644 index 0000000..91d141f --- /dev/null +++ b/utils/read_yml.py @@ -0,0 +1,13 @@ +import yaml + +def read_yaml_file(yaml_file_path): + try: + with open(yaml_file_path, 'r') as file: + data = yaml.safe_load(file) + + function_prompt = data.get('function_prompt', '') + function = data.get('function', {}) + + return function_prompt, function + except Exception as e: + return None, None \ No newline at end of file diff --git a/utils/readme.py b/utils/readme.py new file mode 100644 index 0000000..e571057 --- /dev/null +++ b/utils/readme.py @@ -0,0 +1,10 @@ +import os + +def find_readme_files(directory): + readme_files = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.lower() == "readme.md": + readme_files.append(os.path.join(root, file)) + readme_files = sorted(readme_files, key=len) + return readme_files diff --git a/utils/repo.py b/utils/repo.py new file mode 100644 index 0000000..2c54021 --- 
/dev/null +++ b/utils/repo.py @@ -0,0 +1,25 @@ +from tools import keywords +import requests + +def search_github_repositories_by_keywords(keywords): + query_string = '+'.join(keywords) + + print(query_string) + api_url = f"https://api.github.com/search/repositories?q={query_string}&page=1&per_page=10" + + response = requests.get(api_url) + + if response.status_code == 200: + search_results = response.json() + + repo_urls = [repo["html_url"] for repo in search_results["items"]] + return repo_urls + else: + return [] + +def get_repo_urls(query,model_name,api_type,function_type): + keywds = keywords.get_keywords(query,model_name,api_type,function_type) + print(keywds) + repo_urls = search_github_repositories_by_keywords(keywds) + return repo_urls + diff --git a/utils/repo_description.py b/utils/repo_description.py new file mode 100644 index 0000000..9dc6475 --- /dev/null +++ b/utils/repo_description.py @@ -0,0 +1,19 @@ +import os +import requests + +def get_repo_description(repo_url): + parts = repo_url.strip('/').split('/') + if len(parts) != 5 or parts[2] != 'github.com': + return "Invalid GitHub repo URL" + username, repository_name = parts[3], parts[4] + + api_url = f"https://api.github.com/repos/{username}/{repository_name}" + + response = requests.get(api_url) + + if response.status_code == 200: + repo_info = response.json() + repo_description = repo_info["description"] + return repo_description + else: + return "Unable to get warehouse information" \ No newline at end of file diff --git a/utils/repo_name.py b/utils/repo_name.py new file mode 100644 index 0000000..c748291 --- /dev/null +++ b/utils/repo_name.py @@ -0,0 +1,5 @@ +import os + +def get_repo_name(repo_url): + repo_name = os.path.basename(repo_url.rstrip('/')) + return repo_name \ No newline at end of file