Merge pull request #176 from princeton-nlp/rename-inference
Move inference to swebench.inference sub-package
carlosejimenez authored Jul 10, 2024
2 parents a39c279 + 60c88b8 commit d44d366
Showing 20 changed files with 58 additions and 44 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -99,9 +99,9 @@ python -m swebench.harness.run_evaluation --help
 ```
 
 Additionally, the SWE-Bench repo can help you:
-* Train your own models on our pre-processed datasets
-* Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/inference/) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it.
-* Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-Bench tasks.
+* Train your own models on our pre-processed datasets
+* Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/inference/README.md) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it.
+* Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-Bench tasks.
 
 ## ⬇️ Downloads
 | Datasets | Models |
27 changes: 27 additions & 0 deletions setup.py
@@ -41,5 +41,32 @@
         'unidiff',
         'tqdm',
     ],
+    extras_require={
+        'inference': [
+            'tiktoken',
+            'openai',
+            'anthropic',
+            'transformers',
+            'peft',
+            'sentencepiece',
+            'protobuf',
+            'torch',
+            'flash_attn',
+            'triton',
+        ],
+        'retrieval': [
+            'tiktoken',
+            'openai',
+            'anthropic',
+            'transformers',
+            'peft',
+            'sentencepiece',
+            'protobuf',
+            'torch',
+            'flash_attn',
+            'triton',
+            'pyserini',
+        ],
+    },
     include_package_data=True,
 )
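
With these extras in place, the optional dependency groups install via pip's extras syntax. A minimal sketch, assuming an editable install from the repository root (note that `flash_attn` generally expects `torch` and a CUDA toolchain to already be present when it builds):

```bash
# Base install: evaluation harness and data collection only
pip install -e .

# Add the model-inference stack (API clients, transformers, flash-attn, ...)
pip install -e ".[inference]"

# Retrieval is the same stack plus pyserini for BM25 indexing
pip install -e ".[retrieval]"
```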
10 changes: 7 additions & 3 deletions inference/README.md → swebench/inference/README.md
@@ -17,7 +17,7 @@ This python script is designed to run inference on a dataset using either the Op
 For instance, to run this script on SWE-bench with the ``Oracle`` context and Anthropic's Claude 2 model, you can run the following command:
 ```bash
 export ANTHROPIC_API_KEY=<your key>
-python run_api.py --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs
+python -m swebench.inference.run_api --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs
 ```
 
 You can also specify further options:
@@ -35,7 +35,11 @@ This script is similar to `run_api.py`, but it is designed to run inference usin
 
 For instance, to run this script on SWE-bench with the ``Oracle`` context and SWE-Llama, you can run the following command:
 ```bash
-python run_llama.py --dataset_path princeton-nlp/SWE-bench_oracle --model_name_or_path princeton-nlp/SWE-Llama-13b --output_dir ./outputs --temperature 0
+python -m swebench.inference.run_llama \
+    --dataset_path princeton-nlp/SWE-bench_oracle \
+    --model_name_or_path princeton-nlp/SWE-Llama-13b \
+    --output_dir ./outputs \
+    --temperature 0
 ```
 
 You can also specify further options:
@@ -54,6 +58,6 @@ Then run `run_live.py` to try solving a new issue. For example, you can try solv
 
 ```bash
 export OPENAI_API_KEY=<your key>
-python run_live.py --model_name gpt-3.5-turbo-1106 \
+python -m swebench.inference.run_live --model_name gpt-3.5-turbo-1106 \
     --issue_url https://github.com/huggingface/transformers/issues/26706
 ```
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

@@ -33,7 +33,7 @@
 from transformers.utils import logging
 from transformers.models.llama.configuration_llama import LlamaConfig
 
-from llamao.distributed_attention import DistributedAttention
+from swebench.inference.llamao.distributed_attention import DistributedAttention
 from flash_attn import flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func
 from flash_attn.bert_padding import unpad_input, pad_input
 
File renamed without changes.
File renamed without changes.

@@ -14,12 +14,7 @@
 from tqdm.auto import tqdm
 from argparse import ArgumentParser
 
-try:
-    from utils import list_files
-    from utils import string_to_bool
-except:
-    from .utils import list_files
-    from .utils import string_to_bool
+from swebench.inference.make_datasets.utils import list_files, string_to_bool
 
 import logging
 

@@ -8,12 +8,8 @@
 import unidiff
 from tqdm.auto import tqdm
 
-try:
-    from tokenize_dataset import TOKENIZER_FUNCS
-    from utils import AutoContextManager, ingest_directory_contents
-except:
-    from .tokenize_dataset import TOKENIZER_FUNCS
-    from .utils import AutoContextManager, ingest_directory_contents
+from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS
+from swebench.inference.make_datasets.utils import AutoContextManager, ingest_directory_contents
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)

@@ -12,14 +12,9 @@
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 from tqdm.auto import tqdm
 
-try:
-    from create_instance import PROMPT_FUNCTIONS, add_text_inputs
-    from tokenize_dataset import TOKENIZER_FUNCS
-    from utils import string_to_bool
-except:
-    from .create_instance import PROMPT_FUNCTIONS, add_text_inputs
-    from .tokenize_dataset import TOKENIZER_FUNCS
-    from .utils import string_to_bool
+from swebench.inference.make_datasets.create_instance import add_text_inputs, PROMPT_FUNCTIONS
+from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS
+from swebench.inference.make_datasets.utils import string_to_bool
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
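
The same refactor repeats across the three `make_datasets` hunks above. The old code guessed between script-style and package-relative imports, and the bare `except:` (which catches `BaseException`, not just `ImportError`) could mask real errors raised while importing the sibling module. A sketch of the before and after, using `list_files` as the example:

```python
# Before: works whether the file is run as a script or imported as a
# module, but any failure in the first import -- even a genuine bug
# inside utils.py -- silently triggers the fallback, producing a
# confusing secondary error.
try:
    from utils import list_files
except:  # bare except catches everything, including KeyboardInterrupt
    from .utils import list_files

# After: a single absolute import that resolves identically whether the
# module is imported or run with `python -m`, provided the swebench
# package is installed (e.g. via `pip install -e .`).
from swebench.inference.make_datasets.utils import list_files
```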
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion inference/run_api.py → swebench/inference/run_api.py
@@ -21,7 +21,7 @@
     wait_random_exponential,
 )
 from datasets import load_dataset, load_from_disk
-from make_datasets.utils import extract_diff
+from swebench.inference.make_datasets.utils import extract_diff
 from argparse import ArgumentParser
 import logging
 
18 changes: 9 additions & 9 deletions inference/run_live.py → swebench/inference/run_live.py
@@ -15,20 +15,20 @@
 import time
 from datetime import datetime
 from tqdm.auto import tqdm
-from make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch
-from make_datasets.bm25_retrieval import (
-    make_index,
-    clone_repo,
-    search,
-    DOCUMENT_ENCODING_FUNCTIONS,
-)
-from make_datasets.create_instance import (
+from swebench.inference.make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch
+from swebench.inference.make_datasets.create_instance import (
     PROMPT_FUNCTIONS,
     TOKENIZER_FUNCS,
     make_code_text,
     ingest_files,
 )
-from run_api import call_chat, call_anthropic
+from swebench.inference.make_datasets.bm25_retrieval import (
+    make_index,
+    clone_repo,
+    search,
+    DOCUMENT_ENCODING_FUNCTIONS,
+)
+from swebench.inference.run_api import call_chat, call_anthropic
 import logging
 from argparse import ArgumentParser
 
11 changes: 4 additions & 7 deletions inference/run_llama.py → swebench/inference/run_llama.py
@@ -14,17 +14,14 @@
     StoppingCriteria,
     StoppingCriteriaList,
 )
-
-from llamao.modeling_flash_llama import LlamaForCausalLM as AutoModelForCausalLM
-from make_datasets.utils import extract_diff
-from pathlib import Path
+from swebench.inference.llamao.modeling_flash_llama import LlamaForCausalLM as AutoModelForCausalLM
+from swebench.inference.make_datasets.utils import extract_diff
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 
-DEVICE_MAPS = json.load(open("codellama_device_maps.json"))
-
-
+from pathlib import Path
+DEVICE_MAPS = json.load(open(Path(__file__).parent / "codellama_device_maps.json"))
 
 def get_output_file(
     output_dir,
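
Beyond the rename, this last hunk fixes a working-directory bug: `open("codellama_device_maps.json")` only worked if Python was launched from the module's own directory, whereas `Path(__file__).parent / ...` resolves the file relative to the module itself. A possible alternative (not what this commit does) is the standard library's `importlib.resources`, which also works when the package is installed in zipped form, assuming the JSON file ships as package data:

```python
import json
from importlib import resources  # resources.files() needs Python 3.9+

# Locate the data file relative to the installed package, independent of
# the current working directory.
device_maps_file = resources.files("swebench.inference").joinpath(
    "codellama_device_maps.json"
)
DEVICE_MAPS = json.loads(device_maps_file.read_text())
```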
