diff --git a/README.md b/README.md
index 03ca1143..52bde40f 100644
--- a/README.md
+++ b/README.md
@@ -99,9 +99,9 @@ python -m swebench.harness.run_evaluation --help
 ```
 
 Additionally, the SWE-Bench repo can help you:
-* Train your own models on our pre-processed datasets 
-* Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/inference/) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it. 
-* Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-Bench tasks. 
+* Train your own models on our pre-processed datasets
+* Run [inference](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/inference/README.md) on existing models (either models you have on-disk like LLaMA, or models you have access to through an API like GPT-4). The inference step is where you get a repo and an issue and have the model try to generate a fix for it.
+* Run SWE-bench's [data collection procedure](https://github.com/princeton-nlp/SWE-bench/blob/main/swebench/collect/) on your own repositories, to make new SWE-Bench tasks.
 
 ## ⬇️ Downloads
 | Datasets | Models |
diff --git a/setup.py b/setup.py
index fa39424c..b0c29408 100644
--- a/setup.py
+++ b/setup.py
@@ -41,5 +41,32 @@
         'unidiff',
         'tqdm',
     ],
+    extras_require={
+        'inference': [
+            'tiktoken',
+            'openai',
+            'anthropic',
+            'transformers',
+            'peft',
+            'sentencepiece',
+            'protobuf',
+            'torch',
+            'flash_attn',
+            'triton',
+        ],
+        'retrieval': [
+            'tiktoken',
+            'openai',
+            'anthropic',
+            'transformers',
+            'peft',
+            'sentencepiece',
+            'protobuf',
+            'torch',
+            'flash_attn',
+            'triton',
+            'pyserini',
+        ],
+    },
     include_package_data=True,
 )
\ No newline at end of file
diff --git a/inference/README.md b/swebench/inference/README.md
similarity index 87%
rename from inference/README.md
rename to swebench/inference/README.md
index 1ad7543a..ffeb5882 100644
--- a/inference/README.md
+++ b/swebench/inference/README.md
@@ -17,7 +17,7 @@ This python script is designed to run inference on a dataset using either the Op
 For instance, to run this script on SWE-bench with the ``Oracle`` context and Anthropic's Claude 2 model, you can run the following command:
 ```bash
 export ANTHROPIC_API_KEY=
-python run_api.py --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs
+python -m swebench.inference.run_api --dataset_name_or_path princeton-nlp/SWE-bench_oracle --model_name_or_path claude-2 --output_dir ./outputs
 ```
 
 You can also specify further options:
@@ -35,7 +35,11 @@ This script is similar to `run_api.py`, but it is designed to run inference usin
 
 For instance, to run this script on SWE-bench with the ``Oracle`` context and SWE-Llama, you can run the following command:
 ```bash
-python run_llama.py --dataset_path princeton-nlp/SWE-bench_oracle --model_name_or_path princeton-nlp/SWE-Llama-13b --output_dir ./outputs --temperature 0
+python -m swebench.inference.run_llama \
+    --dataset_path princeton-nlp/SWE-bench_oracle \
+    --model_name_or_path princeton-nlp/SWE-Llama-13b \
+    --output_dir ./outputs \
+    --temperature 0
 ```
 
 You can also specify further options:
@@ -54,6 +58,6 @@ Then run `run_live.py` to try solving a new issue. For example, you can try solv
 
 ```bash
 export OPENAI_API_KEY=
-python run_live.py --model_name gpt-3.5-turbo-1106 \
+python -m swebench.inference.run_live --model_name gpt-3.5-turbo-1106 \
     --issue_url https://github.com/huggingface/transformers/issues/26706
 ```
diff --git a/inference/__init__.py b/swebench/inference/__init__.py
similarity index 100%
rename from inference/__init__.py
rename to swebench/inference/__init__.py
diff --git a/inference/codellama_device_maps.json b/swebench/inference/codellama_device_maps.json
similarity index 100%
rename from inference/codellama_device_maps.json
rename to swebench/inference/codellama_device_maps.json
diff --git a/inference/environment.yml b/swebench/inference/environment.yml
similarity index 100%
rename from inference/environment.yml
rename to swebench/inference/environment.yml
diff --git a/inference/llamao/__init__.py b/swebench/inference/llamao/__init__.py
similarity index 100%
rename from inference/llamao/__init__.py
rename to swebench/inference/llamao/__init__.py
diff --git a/inference/llamao/distributed_attention.py b/swebench/inference/llamao/distributed_attention.py
similarity index 100%
rename from inference/llamao/distributed_attention.py
rename to swebench/inference/llamao/distributed_attention.py
diff --git a/inference/llamao/modeling_flash_llama.py b/swebench/inference/llamao/modeling_flash_llama.py
similarity index 99%
rename from inference/llamao/modeling_flash_llama.py
rename to swebench/inference/llamao/modeling_flash_llama.py
index faf89fe8..0e6078d6 100644
--- a/inference/llamao/modeling_flash_llama.py
+++ b/swebench/inference/llamao/modeling_flash_llama.py
@@ -33,7 +33,7 @@
 from transformers.utils import logging
 from transformers.models.llama.configuration_llama import LlamaConfig
 
-from llamao.distributed_attention import DistributedAttention
+from swebench.inference.llamao.distributed_attention import DistributedAttention
 from flash_attn import flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func
 from flash_attn.bert_padding import unpad_input, pad_input
 
diff --git a/inference/make_datasets/README.md b/swebench/inference/make_datasets/README.md
similarity index 100%
rename from inference/make_datasets/README.md
rename to swebench/inference/make_datasets/README.md
diff --git a/inference/make_datasets/__init__.py b/swebench/inference/make_datasets/__init__.py
similarity index 100%
rename from inference/make_datasets/__init__.py
rename to swebench/inference/make_datasets/__init__.py
diff --git a/inference/make_datasets/bm25_retrieval.py b/swebench/inference/make_datasets/bm25_retrieval.py
similarity index 99%
rename from inference/make_datasets/bm25_retrieval.py
rename to swebench/inference/make_datasets/bm25_retrieval.py
index a8e9af67..1eb79bc5 100644
--- a/inference/make_datasets/bm25_retrieval.py
+++ b/swebench/inference/make_datasets/bm25_retrieval.py
@@ -14,12 +14,7 @@
 from tqdm.auto import tqdm
 from argparse import ArgumentParser
 
-try:
-    from utils import list_files
-    from utils import string_to_bool
-except:
-    from .utils import list_files
-    from .utils import string_to_bool
+from swebench.inference.make_datasets.utils import list_files, string_to_bool
 
 import logging
 
diff --git a/inference/make_datasets/create_instance.py b/swebench/inference/make_datasets/create_instance.py
similarity index 98%
rename from inference/make_datasets/create_instance.py
rename to swebench/inference/make_datasets/create_instance.py
index 7688d3ac..b5109481 100644
--- a/inference/make_datasets/create_instance.py
+++ b/swebench/inference/make_datasets/create_instance.py
@@ -8,12 +8,8 @@
 import unidiff
 from tqdm.auto import tqdm
 
-try:
-    from tokenize_dataset import TOKENIZER_FUNCS
-    from utils import AutoContextManager, ingest_directory_contents
-except:
-    from .tokenize_dataset import TOKENIZER_FUNCS
-    from .utils import AutoContextManager, ingest_directory_contents
+from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS
+from swebench.inference.make_datasets.utils import AutoContextManager, ingest_directory_contents
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
diff --git a/inference/make_datasets/create_text_dataset.py b/swebench/inference/make_datasets/create_text_dataset.py
similarity index 96%
rename from inference/make_datasets/create_text_dataset.py
rename to swebench/inference/make_datasets/create_text_dataset.py
index 838d9f32..48ee5e7d 100755
--- a/inference/make_datasets/create_text_dataset.py
+++ b/swebench/inference/make_datasets/create_text_dataset.py
@@ -12,14 +12,9 @@
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 from tqdm.auto import tqdm
 
-try:
-    from create_instance import PROMPT_FUNCTIONS, add_text_inputs
-    from tokenize_dataset import TOKENIZER_FUNCS
-    from utils import string_to_bool
-except:
-    from .create_instance import PROMPT_FUNCTIONS, add_text_inputs
-    from .tokenize_dataset import TOKENIZER_FUNCS
-    from .utils import string_to_bool
+from swebench.inference.make_datasets.create_instance import add_text_inputs, PROMPT_FUNCTIONS
+from swebench.inference.make_datasets.tokenize_dataset import TOKENIZER_FUNCS
+from swebench.inference.make_datasets.utils import string_to_bool
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
diff --git a/inference/make_datasets/eval_retrieval.py b/swebench/inference/make_datasets/eval_retrieval.py
similarity index 100%
rename from inference/make_datasets/eval_retrieval.py
rename to swebench/inference/make_datasets/eval_retrieval.py
diff --git a/inference/make_datasets/tokenize_dataset.py b/swebench/inference/make_datasets/tokenize_dataset.py
similarity index 100%
rename from inference/make_datasets/tokenize_dataset.py
rename to swebench/inference/make_datasets/tokenize_dataset.py
diff --git a/inference/make_datasets/utils.py b/swebench/inference/make_datasets/utils.py
similarity index 100%
rename from inference/make_datasets/utils.py
rename to swebench/inference/make_datasets/utils.py
diff --git a/inference/run_api.py b/swebench/inference/run_api.py
similarity index 99%
rename from inference/run_api.py
rename to swebench/inference/run_api.py
index 90628886..42d90962 100755
--- a/inference/run_api.py
+++ b/swebench/inference/run_api.py
@@ -21,7 +21,7 @@
     wait_random_exponential,
 )
 from datasets import load_dataset, load_from_disk
-from make_datasets.utils import extract_diff
+from swebench.inference.make_datasets.utils import extract_diff
 from argparse import ArgumentParser
 
 import logging
diff --git a/inference/run_live.py b/swebench/inference/run_live.py
similarity index 96%
rename from inference/run_live.py
rename to swebench/inference/run_live.py
index b947c088..e5fa7c84 100755
--- a/inference/run_live.py
+++ b/swebench/inference/run_live.py
@@ -15,20 +15,20 @@
 import time
 from datetime import datetime
 from tqdm.auto import tqdm
-from make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch
-from make_datasets.bm25_retrieval import (
-    make_index,
-    clone_repo,
-    search,
-    DOCUMENT_ENCODING_FUNCTIONS,
-)
-from make_datasets.create_instance import (
+from swebench.inference.make_datasets.utils import ContextManager, string_to_bool, extract_diff, extract_minimal_patch
+from swebench.inference.make_datasets.create_instance import (
     PROMPT_FUNCTIONS,
     TOKENIZER_FUNCS,
     make_code_text,
     ingest_files,
 )
-from run_api import call_chat, call_anthropic
+from swebench.inference.make_datasets.bm25_retrieval import (
+    make_index,
+    clone_repo,
+    search,
+    DOCUMENT_ENCODING_FUNCTIONS,
+)
+from swebench.inference.run_api import call_chat, call_anthropic
 
 import logging
 from argparse import ArgumentParser
diff --git a/inference/run_llama.py b/swebench/inference/run_llama.py
similarity index 98%
rename from inference/run_llama.py
rename to swebench/inference/run_llama.py
index fcc6bda5..b1850f24 100644
--- a/inference/run_llama.py
+++ b/swebench/inference/run_llama.py
@@ -14,17 +14,14 @@
     StoppingCriteria,
     StoppingCriteriaList,
 )
-
-from llamao.modeling_flash_llama import LlamaForCausalLM as AutoModelForCausalLM
-from make_datasets.utils import extract_diff
+from pathlib import Path
+from swebench.inference.llamao.modeling_flash_llama import LlamaForCausalLM as AutoModelForCausalLM
+from swebench.inference.make_datasets.utils import extract_diff
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 
-DEVICE_MAPS = json.load(open("codellama_device_maps.json"))
-
-
-from pathlib import Path
+DEVICE_MAPS = json.load(open(Path(__file__).parent / "codellama_device_maps.json"))
 
 def get_output_file(
     output_dir,
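
For context, the `extras_require` groups added in `setup.py` and the module moves above imply the following end-to-end usage. This is a minimal sketch, assuming an editable install from a local checkout of the repo; the patch itself does not add these commands anywhere:

```bash
# Install the optional dependency groups declared in setup.py above.
# 'retrieval' is a superset of 'inference' that adds pyserini for BM25.
pip install -e ".[inference]"
# or: pip install -e ".[retrieval]"

# The scripts are now launched as modules of the swebench package,
# matching the README changes above, instead of as bare files in inference/:
python -m swebench.inference.run_api \
    --dataset_name_or_path princeton-nlp/SWE-bench_oracle \
    --model_name_or_path claude-2 \
    --output_dir ./outputs
```

Launching everything via `python -m swebench.inference.…` is what lets the absolute imports introduced in this diff (e.g. `from swebench.inference.make_datasets.utils import extract_diff`) replace the old `try`/`except` relative-import fallbacks, and it is also why `run_llama.py` now resolves `codellama_device_maps.json` relative to `__file__` rather than the current working directory.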