diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 986fdf4df..21e2486bb 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -27,6 +27,7 @@ jobs: - benchmarks/deeplab - benchmarks/resnet_imagenet - end-to-end-examples/sec_10k_qa + - end-to-end-examples/support-chatbot - end-to-end-examples/stable_diffusion - end-to-end-examples/stable_diffusion_dreambooth - third-party/nemo diff --git a/examples/end-to-end-examples/support_chatbot/README.md b/examples/end-to-end-examples/support_chatbot/README.md new file mode 100644 index 000000000..145b4f94e --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/README.md @@ -0,0 +1,179 @@ +## Overview + +In this tutorial, we will be creating an application that answers questions about the MosaicML composer codebase. The basic structure of this application will be a retrieval question answering system where the user will provide the chatbot with a question, and then a language model will answer the question based on the retrieved text. See some [great](https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html#retrieval-question-answering) [materials](https://blog.langchain.dev/langchain-chat/) from [LangChain](https://python.langchain.com/en/latest/index.html) for more exploration on this type of application. + +By default the model that is used throughout is [MPT-30b](https://huggingface.co/mosaicml/mpt-30b), a 30-billion parameter large language model trained by MosaicML. See [our blog](https://www.mosaicml.com/blog/mpt-30b) for more details. + +![demo](web_app_screenshot.png) + + +## Which MosaicML tools will we use? +- [LLM-foundry](https://github.com/mosaicml/llm-foundry): An open-source PyTorch library of tools for training, finetuning, evaluating, and deploying LLMs for inference. +- [Composer](https://github.com/mosaicml/composer): An open-source PyTorch library for easy large scale deep learning. +- [Streaming](https://github.com/mosaicml/streaming): An open-source PyTorch library for efficiently and accurately streaming data from the cloud to your training job, whereever that job is running. +- [MCLI](https://docs.mosaicml.com/projects/mcli/en/latest/): The command line interface for running training and inference jobs on the MosaicML platform. + + +## Setup + +Jobs can be submitted to the MosaicML platform either using the [SDK](https://docs.mosaicml.com/projects/mcli/en/latest/training/working_with_runs.html#manage-a-run-with-the-sdk), or MCLI yamls. All commands in this tutorial are going to be run using MCLI yamls. For more MosaicML platform documentation, see the [MosaicML documentation](https://docs.mosaicml.com/projects/mcli/en/latest/), and for a detailed explanation of our yamls, see [training yaml documentation](https://docs.mosaicml.com/projects/mcli/en/latest/training/yaml_schema.html) and [inference yaml documentation](https://docs.mosaicml.com/projects/mcli/en/latest/inference/inference_schema.html). For understanding how this works, and what is actually running, there are a few important details to understand. + +1) The MosaicML platform does not have permanent storage on the compute nodes. This means that all data will be streamed in/out to/from a cloud object store (please note that our deployment only works for s3 ,gs, and HuggingFace). In this tutorial we will use [MosaicML Streaming](https://github.com/mosaicml/streaming) to accomplish this. 
See [MosaicML setup](#mosaicml-platform-setup) for more details on setting up your cloud provider of choice. +1) The `command` section of the yaml is what will actually get run on the compute node. If you are trying to debug/run something locally, you should run what appears in the `command` section (after setting up your local environment). +1) The `parameters` section of the yaml is mounted as a single `.yaml` file at `/mnt/config/parameters.yaml`, which your `command` can then read from. This `parameters` section is how we will pass the training configuration parameters to the training script. + + +### MosaicML platform setup + +Before starting this tutorial, you should make sure that you have access to the MosaicML platform. You'll need access to both training and inference services to complete this tutorial, although you can follow this tutorial up to the [deployment](#6-deploy-your-model-and-an-embedding-model) section if you just have access to training. Please [reach out](https://forms.mosaicml.com/demo?utm_source=inference&utm_medium=mosaicml.com&utm_campaign=always-on) if you would like to sign up, and reach out if you are already a customer and need to gain access to either service. + +1. Go through the [getting started guide](https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/getting_started.html). +1. Set up the object store of your choice, by following the [secrets guide](https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/index.html) for your cloud provider of choice. +1. [Optional] Set up [Weights & Biases](https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/wandb.html) or [CometML](https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/comet.html) for tracking your experiments. + +Once you have done all of this, you should be ready to get started with this tutorial! + + +### Local setup + +All that is needed for local setup is to clone this repository and install a few dependencies, as the only thing you will be running locally is the final application. Everything else will be run through the MosaicML platform. +```bash +git clone https://github.com/mosaicml/examples +cd examples/examples/end-to-end-examples/support_chatbot +python -m venv examples-chatbot-venv +source examples-chatbot-venv/bin/activate +pip install -r requirements-cpu.txt +# Your api token can be found by running `mcli config`. This token is set as an environment variable for the langchain integration +export MOSAICML_API_TOKEN= +``` + + +## How to follow this tutorial + +Each section of this tutorial will have a command to run, which fields you need to edit before running the command, the expected input and output of that section, and a description of what is happening. The steps should be run sequentially, and you must wait for each prior step to complete before running the next one. The commands will be run using MCLI yamls (except the final front end which will run locally). All intermediate output will be written to cloud object store. Everywhere that a path is used, it will be of the form `CLOUD://BUCKET_NAME/path/to/my/folder/`. You will need to fill in the `CLOUD` (e.g. `s3`, `oci`, `gs`) and the `BUCKET_NAME` (e.g. `my-bucket`). The description of what is happening will be a high level overview of the steps that are being taken, and will not go into detail about the code, but the code and yamls will have detailed comments, and the description will contain pointers to where to learn more. 
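For example, with an S3 bucket named `my-bucket` (an illustrative name, not one created for you by this tutorial), a placeholder path used later in the tutorial would be filled in as follows:

```bash
# Placeholder form used throughout this tutorial
CLOUD://BUCKET_NAME/support-bot-demo/data/composer_codebase_mds/
# Filled in for an S3 bucket named my-bucket
s3://my-bucket/support-bot-demo/data/composer_codebase_mds/
```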
We encourage you to read the yamls in detail to gain a better understanding of the various options that are available. + +## Step 1: Getting Our Data + +Let's first start with downloading the github repository that we want to finetune our model on so that it can get a basic understanding of the codebase. The [repo_downloader](./scripts/repo_downloader.py) will clone the git repository into a designated cloning directory where it will flatten the repository to be an assortment of text files in a local directory: `retrieval_data/{REPOSITORY_NAME}`. It will then erase the cloning directory. + +```bash +python scripts/repo_downloader.py REPO_LINKS +``` + +**Fields to replace with your values:** `REPO_LINKS` (in the command line). For instance, to download the MosaicML Composer repository, we run: +```bash +python scripts/repo_downloader.py https://github.com/mosaicml/composer +``` + +## Step 2: Converting to MDS + +As mentioned in the [MosaicML platform setup](#mosaicml-platform-setup) section, the MosaicML platform does not have permanent storage on the compute nodes. This means that all data will be streamed in and out from a cloud provider. In order to make this process as efficient as possible during a training run, we will convert the data into a format that is optimized for streaming, using our [Streaming](https://github.com/mosaicml/streaming) library. This format is called [MDS](https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/dataset_format.html#mds), and is a simple format that is optimized for streaming. + +This example will contain a script for a [text to MDS](./scripts/conversion/convert_txt_to_stream.py) conversion. For text to MDS conversion, we will run: + +```bash +mcli run -f mcli_yamls/conversion/convert_txt_to_stream.yaml --cluster CLUSTER +``` + +**The conversion YAMLs will not work on local directories if the data is not git pushed onto the repository linked by the YAML.** This means that it will not be able to recognize a remote path to data, but only a local path in the github repository (you can feed it the relative path from *step 1*). Thus, if you choose to use the YAML method, make sure that you push the data downloaded from *step* one to your repository. That being said, github repositories are typically very small and you can probably just run this locally: + +```bash +python scripts/conversion/convert_txt_to_stream.py \ + --out_root CLOUD://BUCKET/support-bot-demo/data/DATA_NAME_MDS/ \ + --in_root PATH_TO_TXT_FOLDER +``` + +**Fields to replace with your values:** `CLUSTER` (in the command line), `CLOUD` (in the yaml), `BUCKET` (in the yaml), 'DATA_NAME_MDS' (in the yaml), `PATH_TO_TXT_FOLDER` (in the yaml). Please note that PATH_TO_TXT_FOLDER **MUST** be a local directory (not an OCI link) due to some limitations from OCI. To follow with our previous example in Step 1, if we want to convert the folder containing all of composer as text, we will run with `DATA_NAME_MDS = composer_codebase_mds` and `PATH_TO_TXT_FOLDER = retrieval_data/composer` + +## Step 3: Finetuning on our Repository + +Next, we will finetune our pretrained model on the train split of our data for the end task of answering questions about the codebase. + +Please check out the [training directory](./mcli-yamls/finetune) for all of the details. 
This yaml will load the pretrained weights for `mpt-30b` available on the [HuggingFace Hub](https://huggingface.co/mosaicml/mpt-30b), and then train using the normal causal language modeling objective on our datasets that we processed in the previous step. The [training script](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/train.py) itself, is from LLM-foundry. + +To run finetuning on the composer codebase, run the following: + +```bash +mcli run -f mcli_yamls/finetune/finetune_composer_codebase.yaml --cluster CLUSTER +``` +**Fields to replace with your values:** `CLUSTER` (in the command line), `CLOUD` (in the yaml), `BUCKET_NAME` (in the yaml). + +**Inputs:** `composer_codebase_mds` cloud folder from step 2 + +**Outputs:** the checkpoints from your training, saved to the `save_folder` specified in the yaml + +## Step 4: Finetuning on Chatv2 + +Next, we will finetune our model on the train split of the chatv2 dataset to ensure that the bot still has its QA abilities. Chatv2 is the dataset that was used to finetune the MPT-30B model to be [MPT-30B-Chat](https://huggingface.co/mosaicml/mpt-30b-chat) and includes GPT generated output. To run finetuning on Chatv2, run the following: + +```bash +mcli run -f mcli_yamls/finetune/finetune_30b_chat.yaml --cluster CLUSTER +``` +**Fields to replace with your values:** `CLUSTER` (in the command line), `CLOUD` (in the yaml), `BUCKET_NAME` (in the yaml). + +**Outputs:** the checkpoints from your training, saved to the `save_folder` specified in the yaml + +## Step 5: Converting Composer Checkpoint to HuggingFace + +Before we can deploy our model, we need to convert it into the standard HuggingFace checkpoint folder. We will use the [conversion script](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/convert_composer_to_hf.py) from LLM-foundry to do this. This script will take the Composer checkpoint, and write out all the files that HuggingFace expects in a checkpoint folder. You can additionally add the `--hf_repo_for_upload` argument if you would like to upload directly to a private repo on the HuggingFace Hub (you will also need to [set the `HUGGING_FACE_HUB_TOKEN` environment variable](https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/env.html) to do this). + +**Fields to replace with your values:** `REPLACE_WITH_YOUR_CLUSTER` (in the command), `CLOUD` (in the yaml), `BUCKET_NAME` (in the yaml), `CHECKPOINT_FOLDER_NAME` (in the yaml), `HF_FOLDER_NAME` (in the yaml) + +**Inputs:** the final checkpoint from step 4 saved to the `save_folder` specified in the previous yaml and where you want the converted checkpoint to go in `hf_output_path` + +**Command:** +```bash +mcli run -f mcli-yamls/convert_checkpoint_to_huggingface.yaml --cluster REPLACE_WITH_YOUR_CLUSTER +``` + +**Outputs:** the ``CLOUD://BUCKET_NAME/support-bot-demo/converted_checkpoints/HF_FOLDER_NAME`` folder, containing the HuggingFace checkpoint files + +## Step 6: Deploy your model + +Now that we have our trained model, we will deploy it using MosaicML inference. This will allow us to use the model as an API. 
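Once the deployment created by the command below is live, you can sanity-check it from Python using the same LangChain MosaicML integration that the final application uses. The snippet below is a minimal sketch rather than part of the tutorial's scripts: the endpoint URL is a placeholder that depends on the name of your deployment (run `mcli get deployments` to find it), and it assumes `MOSAICML_API_TOKEN` is exported as described in the local setup.

```python
from langchain.llms import MosaicML

# Placeholder endpoint URL -- substitute the URL of the deployment created in this step
llm = MosaicML(
    endpoint_url='https://models.hosted-on.mosaicml.hosting/YOUR_DEPLOYMENT_NAME/v1/predict',
    inject_instruction_format=True,
    model_kwargs={'max_length': 5000, 'top_k': 10, 'top_p': 0.95, 'temperature': 0.1},
)

# Send a single prompt to the deployed model and print the completion
print(llm('What is MosaicML Composer?'))
```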
For more examples of inference deployments, see [inference-deployments](../../inference-deployments/) + + +**Fields to replace with your values:** `REPLACE_WITH_YOUR_CLUSTER` (in the command), `BUCKET_NAME` (in the yaml), `HF_FOLDER_NAME` (in the yaml)` + +**Inputs:** the HuggingFace format checkpoint from step 5 + +**Command**: +```bash +mcli deploy -f mcli-yamls/deploy_llm.yaml --cluster REPLACE_WITH_YOUR_CLUSTER +``` + +**Outputs:** A deployment for the language model + + +## Step 7: Application with gradio + +Now that we've processed our data, trained our models, and deployed our models, we can run the application! We will use Gradio and LangChain to make a simple question answering interface. + +We will make use of the MosaicML integration in LangChain for [LLMs](https://github.com/hwchase17/langchain/blob/master/langchain/llms/mosaicml.py) and [embeddings](https://github.com/hwchase17/langchain/blob/master/langchain/embeddings/mosaicml.py), and use the [`RetrievalQA`](https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html?highlight=retrievalqa) abstraction with the [`FAISS`](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/faiss.html?highlight=faiss) to run the application locally. + +Upon the first run of this app, expect that it might take ~20 minutes to embed all of the data and store the vector store in `retrieval_data/vectors.pickle`. After the first run, unless you delete the pickle file, it will just reference the pickle file and not re-embed. + +Play around with the application and imagine ways you could improve it or apply a similar approach to your data! + +You can find the names of your deployments by running `mcli get deployments`. + +After running the `python` command, you should see a link to your application. It is the link after `Running on local URL:`, _not_ the url after `Launching in *reload mode* on:`. + +**Command**: +```bash +python app.py +``` + +What this will do is combine everything together to create your chatbot! With default args, it downloads all of the public MosaicML repositories as well as uses a [web_downloader.py](./web_downloader.py) to download all of the MosaicML Docs. After, it will embed the data to create a vector store that LangChain can run similarity searches on to provide the model with context when answering questions. + +Please note that you can customize this app by updating the parser args. Please see what you can customize in [`app_demo.py`](./app_demo.py) + + +## What next? + +Now that you've seen how to use MosaicML to train and deploy a language model, you can try it out on your own data! 
Here are some ideas for where to go from here: +- Play around with the hyperparameters and prompts for all of the components in [`app.py`](./app.py) and see how they change the output +- Try out different models from the HuggingFace Hub, both for the text embedding and for the LLM +- Try changing optimization parameters in the training yamls to see how they affect the training +- Try swapping in a new dataset, or applying the models to a new task +- Read more about the MosaicML components in this tutorial ([LLM-foundry](https://github.com/mosaicml/llm-foundry), [Composer](https://docs.mosaicml.com/projects/composer/en/latest/), [Streaming](https://docs.mosaicml.com/projects/streaming/en/latest/), [MCLI](https://docs.mosaicml.com/projects/mcli/en/latest/)) diff --git a/examples/end-to-end-examples/support_chatbot/__init__.py b/examples/end-to-end-examples/support_chatbot/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/end-to-end-examples/support_chatbot/app_demo.py b/examples/end-to-end-examples/support_chatbot/app_demo.py new file mode 100644 index 000000000..c022cb681 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/app_demo.py @@ -0,0 +1,210 @@ +from argparse import ArgumentParser, Namespace +import gradio as gr +from langchain.embeddings import MosaicMLInstructorEmbeddings +from langchain.llms import MosaicML +from chatbot import ChatBot +import os + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +EVAL_7B_TEMPLATE = (f'Answer the following question as one function, class, or object. If you do not know, just say "I do not know".' + '\n{context}' + '\nQuestion: {question}') + +EVAL_30B_TEMPLATE = ("""<|im_start|>system + A conversation between a user and an LLM-based AI assistant about the codebase for the MosaicML library Composer. + Provide a helpful and simple answer given the following context to the question. If you do not know, just say "I + do not know".<|im_end|> + <|im_start|>context + {context}<|im_end|> + <|im_start|>user + {question}<|im_end|> + <|im_start|>assistant""") + +def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Run a chatbot!' 
+ ) + parser.add_argument( + '--endpoint_url', + type=str, + default='https://models.hosted-on.mosaicml.hosting/mpt-30b-chat/v1/predict', + required=False, + help='The endpoint of our MosaicML LLM Model') + parser.add_argument( + '--model_name', + type=str, + default='mpt-30b-chat', + required=False, + help='The model name (mpt-30b-chat or mpt-7b) that determines which prompt template to use when evaluating') + parser.add_argument( + '--max_length', + type=int, + default=5000, + required=False, + help='The maximum number tokens of both input and output of the model (it will cut off output if token exceeds this length)') + parser.add_argument( + '--chunk_size', + type=int, + default=800, + required=False, + help='The chunk size when splitting documents') + parser.add_argument( + '--chunk_overlap', + type=int, + default=400, + required=False, + help='The overlap between chunks when splitting documents') + parser.add_argument( + '--retrieval_k', + type=int, + default=5, + required=False, + help='The number of chunks to retrieve as context from vector store') + parser.add_argument( + '--top_k', + type=int, + default=10, + required=False, + help='The number of highest probability vocabulary tokens to keep for top-k-filtering') + parser.add_argument( + '--repository_urls', + type=str, + nargs='*', + default=['https://github.com/mosaicml/composer', + 'https://github.com/mosaicml/streaming', + 'https://github.com/mosaicml/examples', + 'https://github.com/mosaicml/diffusion', + 'https://github.com/mosaicml/llm-foundry'], + required=False, + help='The GitHub repository URLs to download' + ) + parser.add_argument( + '--complex_data_dir', + type=str, + required=False, + default=os.path.join(ROOT_DIR, 'eval_data/complex_eval.jsonl'), + help='complex eval data for human eval') + parser.add_argument( + '--simple_data_dir', + type=str, + required=False, + default=os.path.join(ROOT_DIR, 'eval_data/composer_docstrings.jsonl'), + help='simple eval data for string comparison') + parser.add_argument( + '--complex_chat', + type=int, + default=0, + required=False, + help='Which version of chatting to use (0 for normal, 1 for sub-query, 2 for relation sub-query) Each version is an improvement on the previous though increases response time.') + + parsed = parser.parse_args() + + if parsed.repository_urls is not None: + # Remove whitespace and turn URLs into a list + parsed.repository_urls = ''.join(str(parsed.repository_urls).split()).split(',') + + return parsed + +def main(endpoint_url: str, + model_name: str, + max_length: int, + chunk_size: int, + chunk_overlap: int, + retrieval_k: int, + top_k: int, + repository_urls: list[str], + complex_data_dir: str, + simple_data_dir: str, + chat_version: int) -> None: + + retrieval_dir = os.path.join(ROOT_DIR, 'retrieval_data') + + embeddings = MosaicMLInstructorEmbeddings() + llm = MosaicML( + inject_instruction_format=True, + endpoint_url= endpoint_url, + model_kwargs={ + 'max_length': max_length, + 'top_k': top_k, + 'top_p': 0.95, + 'temperature': 0.1, + # other HuggingFace generation parameters can be set as kwargs here to experiment with different decoding parameters + }, + ) + + chatbot = ChatBot(data_path= retrieval_dir, + embedding=embeddings, + model=llm, + k=retrieval_k, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap) + + if not chatbot.vector_store: + print("can't find vectors.pickle, loading it") + if repository_urls is None: + raise ValueError('No repository URLs provided. 
Please provide a comma separated list of URLs to download') + chatbot.create_vector_store(repository_urls=repository_urls) + + def chat_wrapper(query: str) -> str: + """Wrapper around chatbot.chat() for gradio + Args: + query (str): The query to chatbot + + Returns: + str: The response from chatbot""" + if query == '!eval_simple': + if simple_data_dir is None: + ValueError('No simple data directory provided. Please provide a directory with simple eval data') + if model_name == 'mpt-30b-chat': + return chatbot.evaluate_simple(simple_data_dir, EVAL_30B_TEMPLATE) + elif model_name == 'mpt-7b': + return chatbot.evaluate_simple(simple_data_dir, EVAL_7B_TEMPLATE) + + elif query == '!eval_complex': + if complex_data_dir is None: + ValueError('No complex data directory provided. Please provide a directory with complex eval data') + if model_name == 'mpt-30b-chat': + return chatbot.evaluate_complex(complex_data_dir, EVAL_30B_TEMPLATE) + elif model_name == 'mpt-7b': + return chatbot.evaluate_complex(complex_data_dir, EVAL_7B_TEMPLATE) + + if chat_version == 1: + return chatbot.sub_query_chat(query) + elif chat_version == 2: + return chatbot.relation_sub_query_chat(query) + else: + return chatbot.chat(query) + + def gradio_chat(): + """Simple gradio application for querying the model""" + with gr.Blocks() as demo: + query = gr.Textbox(label='Query', + value='What is AliBi?') + answer = gr.Textbox(label='Answer') + query_btn = gr.Button('Query') + query_btn.click(fn=chat_wrapper, + inputs=[query], + outputs=[answer]) + demo.queue() + demo.launch() + + gradio_chat() + +if __name__ == "__main__": + args = parse_args() + main( + endpoint_url=args.endpoint_url, + model_name=args.model_name, + max_length = args.max_length, + chunk_size = args.chunk_size, + chunk_overlap = args.chunk_overlap, + retrieval_k = args.retrieval_k, + top_k = args.top_k, + repository_urls = args.repository_urls, + complex_data_dir = args.complex_data_dir, + simple_data_dir = args.simple_data_dir, + chat_version = args.complex_chat + ) \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/chatbot.py b/examples/end-to-end-examples/support_chatbot/chatbot.py new file mode 100644 index 000000000..8d698474a --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/chatbot.py @@ -0,0 +1,565 @@ +import os +import json +import re +import string +import time +from tqdm import tqdm + +import langchain +from langchain.document_loaders import UnstructuredFileLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +import pickle +from langchain.prompts import PromptTemplate +from langchain.vectorstores import FAISS +from langchain.chains import LLMChain, RetrievalQA +from langchain.chains.combine_documents.stuff import StuffDocumentsChain +from langchain.schema import Document, BaseRetriever +import sys + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(ROOT_DIR) +from repo_downloader import RepoDownloader +from web_downloader import WebScraper + +class RetrieverWithScore(BaseRetriever): + """Just a custom retriever to track distance between query and retrieval docs + + Args: + search_type (str): How to measure similarity + vector_store (FAISS): Retrieval Doc Embeddings + k: How many chunks + + """ + def __init__(self, + search_type: str, + vector_store: FAISS, + k: int, + score_threshold: int): + self.k = k + self.vector_store=vector_store + self.score_threshold = score_threshold + self.search_type=search_type + + def _get_relevant_documents(self, query: str) -> 
list[Document]: + # [NOTE] we removed the search type, only use search_type = "similarity" + if self.search_type != "similarity": + raise ValueError(f"Only search_type='similarity' is supported with scores") + docs_and_scores = self.vector_store.similarity_search_with_score(query=query, + k=self.k, + score_threshold=self.score_threshold) + for doc, distance in docs_and_scores: + doc.metadata = {**doc.metadata, **{"score": 1-distance}} + return [doc for (doc, _) in docs_and_scores] + + def aget_relevant_documents(self, query): + return self._get_relevant_documents(query) + + def get_relevant_documents(self, query: str) -> list[Document]: + return self._get_relevant_documents(query) + +__all__ = ['ChatBot'] + +EVAL_7B_TEMPLATE = (f'Answer the following question as one function, class, or object. If you do not know, just say "I do not know".' + '\n{context}' + '\nQuestion: {question}') + +CHAT_30B_TEMPLATE = ("""<|im_start|>system + A conversation between a user and an LLM-based AI assistant about the codebase for the MosaicML library Composer. + Provide a helpful and simple answer given the following context to the question. If you do not know, just say "I + do not know".<|im_end|> + <|im_start|>context + {context}<|im_end|> + <|im_start|>user + {question}<|im_end|> + <|im_start|>assistant""") +SUBQUERY_INTENT_TEMPLATE = ("""<|im_start|>system + A conversation between a user and an LLM-based AI assistant about the codebase for MosaicML. + Provide a helpful, short and simple answer given the following context to the question. Do not + attempt to explain any terms and do not go in depth. + If you do not know, just say "I do not know".<|im_end|> + <|im_start|>context + {context}<|im_end|> + <|im_start|>user + What is the user trying to learn from this question: {question}<|im_end|> + <|im_start|>assistant""") +SUBQUERY_RELATED_TEMPLATE = ("""<|im_start|>system + A conversation between a user and an LLM-based AI assistant about the codebase for MosaicML. + Only output a "Yes" or "No" with no extra information given the following context to the question. + The context must be related to the question, otherwise output "No". If you aren't sure, output "No".<|im_end|> + <|im_start|>context + {context}<|im_end|> + <|im_start|>user + Can this question be answered with information provided only from context: {question}<|im_end|> + <|im_start|>assistant""") +PARTIAL_SUBQA_TEMPLATE = ("""<|im_start|>system + A conversation between a user and an LLM-based AI assistant. + Given the context, the job of the assistant is to determine if the user's question is relevant given the context. If the given + context is unrelated to the question, then the assistant will break the question into smaller questions that can likely be answered + by a single section of the relevant context. Else if the question can't be answered using the context alone, the LLM-based AI assistant + should not reply with anything.<|im_end|> + <|im_start|>context + {{context}}<|im_end|> + <|im_start|>user + {{question}} {} Can this question be answered with the context given alone? If so, break the question down into less than five + smaller questions that can likely be answered by a single section of the relevant documentation. Make sure that the smaller question + is related to the main question. 
If you aren't sure, just don't include the smaller question + Please only respond with a list of smaller questions without any extra information.<|im_end|> + <|im_start|>assistant""") +PARTIAL_COMBINE_TEMPLATE = ("""<|im_start|>system A conversation between a user and an LLM-based AI assistant. + Here are smaller questions regarding the user's question and their answers: + {} + Provide a helpful and in depth answer given the following context to the question and heavily reference + the smaller questions provided. + If you do not know, just say "I do not know".<|im_end|> + <|im_start|>context + {{context}}<|im_end|> + <|im_start|>user + {{question}}<|im_end|> + <|im_start|>assistant""") + +class ChatBot: + """Given a folder of .txt files from data_path, create a Chatbot object that can process the files into documents, split them + into managable sizes, and store them in a vector store. The Chatbot can then be used to answer questions about the documents. + + Args: + data_path (str): The path of the directory where the txt files of interest is located + embedding (langchain.embeddings.base.Embeddings): The embedding to use for the vector store + model (langchain.llms.base.LLM): The model to use for the LLMChain + k (int): The number of similar documents to return from the vector store + chunk_size (int): The size of the chunks to split the documents into when splitting the documents + chunk_overlap (int): The amount of overlap between chunks when splitting the documents + + Example: + .. testcode:: + + + from langchain.embeddings import MosaicMLInstructorEmbeddings + from langchain.llms import MosaicML + chatbot = ChatBot(data_path= "support_chatbot/retrieval_data", + embedding=MosaicMLInstructorEmbeddings(), + k=3, + model=MosaicML()) + chatbot.chat() + + + """ + def __init__(self, + data_path: str, + embedding: langchain.embeddings.base.Embeddings, + model: langchain.llms.base.LLM, + chunk_size: int, + chunk_overlap: int, + k: int, + ) -> None: + + self.data_path = data_path + self.embedding = embedding + self.model = model + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.k = k + self.saved_state = {'k': k, 'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap, 'model_k': model.model_kwargs['top_k']} + self.chat_chain = None + self.intent_chain = None + self.subchain = None + self.subsubchain = None + self.vector_store = None + + if os.path.isfile(os.path.join(data_path, 'vectors.pickle')): + with open(os.path.join(self.data_path, 'vectors.pickle'), 'rb') as f: + self.vector_store = pickle.load(f) + + def load_data(self) -> list[Document]: + """Given a directory find all .txt files and load them as documents into a list + + Returns: + list[Document]: list of documents loaded from data_dir + """ + data = [] + for dirpath, _, filenames in os.walk(self.data_path): + for filename in filenames: + if filename.endswith(".txt"): + file_path = os.path.join(dirpath, filename) + loaders = UnstructuredFileLoader(file_path, encoding='utf8') + document = loaders.load()[0] + document.metadata = {**document.metadata, **{'file_name': filename.replace('{slash}', '/').replace('{dot}', '.').replace('{colon}', ':')[:-4]}} + data.append(document) + return data + + def split_pages(self, + pages: list[Document]) -> list[Document]: + """Given a list of documents split them into smaller documents of size `self.chunk_size` + + Args: + pages (list[Document]): list of pages (Documents) we want to split + + Returns: + list[Document]: list of chunks (Documents) split from pages (Documents) 
+ """ + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + separators=[ + r'(?<=\.) ', + r'(?<=\?) ', + r'(?<=\!) ', + r'\n', + ], # Split on periods, question marks, exclamation marks, new lines, spaces, and empty strings, in the order + ) + return text_splitter.split_documents(pages) + + def documents_to_str(self, + documents: list[Document]) -> list[str]: + return map(lambda doc: doc.page_content, documents) + + def clean_response(self, input_text: str) -> str: + """Clean the response from the model by stripping some bad answer prefixes, new lines, etc. + + Args: + input_text (str): The response from the model. + + Returns: + str: The cleaned response. + """ + input_text = str(input_text.strip('\n')) + + context_prefix = 'Context:' + answer_prefix = 'Answer:' + prefixes = [context_prefix, answer_prefix] + while True: + prefix_found = False + for prefix in prefixes: + if input_text.startswith(prefix): + input_text = input_text[len(prefix):].strip() + input_text = input_text.strip('\n') + prefix_found = True + break + if not prefix_found: + break + + input_text = input_text.lstrip('\n :') + return str(input_text) + + def store_vectors(self, + pages: list[Document]) -> None: + """Given a list of documents, split them into chunks, and store them in a vector store. + + Args: + pages (list[Document]): list of pages (Documents) we have splitted + """ + content_batches = [] + content_current_batch = [] + + current_char_count = 0 + for page in pages: + content_current_batch.append(page) + current_char_count += len(page.page_content) + + if current_char_count > 1e4: + content_batches.append(content_current_batch) + content_current_batch = [] + current_char_count = 0 + + + if len(content_current_batch) > 0: + content_batches.append(content_current_batch) + + + txt_embeddings = [] + + for batch in tqdm(content_batches, desc='Embedding documents', total=len(content_batches)): + batch_embeddings = self.embedding.embed_documents([p.page_content for p in batch]) + txt_embeddings.extend(list(zip([p.page_content for p in batch], batch_embeddings))) + + # Component for storing the embeddings in a vector store, using FAISS + vector_store = FAISS.from_embeddings( + text_embeddings=txt_embeddings, + metadatas=[p.metadata for p in pages], + embedding=self.embedding + ) + + with open(os.path.join(ROOT_DIR, 'retrieval_data/vectors.pickle'), 'wb') as f: + pickle.dump(vector_store, f) + self.vector_store = vector_store + + def create_vector_store(self, repository_urls) -> None: + """Download the repositories, load the data, split the data into chunks, and store the chunks in a vector store. + + Args: + repository_urls (list[str]): list of repository urls to download + """ + scraper = WebScraper(path=self.data_path) + scraper.scrape() + for repo_url in repository_urls: + downloader = RepoDownloader(output_dir=self.data_path, current_dir="", repo_url=repo_url) + if os.path.exists(downloader.clone_dir): + continue + downloader.download_repo() + + pages = self.load_data() + documents = self.split_pages(pages) + self.store_vectors(documents) + + def create_chain(self, + prompt_template: str, + score_threshold: int=0.4) -> RetrievalQA: + """Create a RetrievalQAWithScores given a prompt template. 
+ + Args: + prompt_template (str): The prompt template to use for the chain + """ + + retriever = RetrieverWithScore(search_type='similarity', + vector_store=self.vector_store, + k=self.k, + score_threshold=score_threshold) + + answer_question_prompt_template = PromptTemplate( + template=prompt_template, + input_variables=['context', 'question']) + + # Component connecting the LLM with the prompt template + llm_chain = LLMChain( + llm=self.model, + prompt=answer_question_prompt_template, + ) + + doc_prompt = PromptTemplate(input_variables=['page_content'], + template='Context:\n{page_content}') + + # Component connecting the context documents with the LLM chain + stuff_documents_chain = StuffDocumentsChain( + llm_chain=llm_chain, + document_variable_name='context', + document_prompt=doc_prompt, + ) + + # Complete component for retrieval question answering + chain = RetrievalQA( + retriever=retriever, + combine_documents_chain=stuff_documents_chain, + return_source_documents=True, + ) + + return chain + + def normalize_str(self, + answer: str): + """Lower text and remove punctuation, articles and extra whitespace. + + Copied from https://github.com/mandarjoshi90/triviaqa/blob/master/evaluation/triviaqa_evaluation.py + """ + + def remove_articles(text: str) -> str: + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text: str) -> str: + return ' '.join(text.split()) + + def handle_punc(text: str) -> str: + exclude = set(string.punctuation + ''.join([u'‘', u'’', u'´', u'`'])) + return ''.join(ch if ch not in exclude else ' ' for ch in text) + + def lower(text: str) -> str: + return text.lower() + + def remove_parentheses(s): + return re.sub(r'\(.*?\)', '', s) + + def replace_underscore(s): + return re.sub('_', '-', s) + + return white_space_fix(remove_parentheses(remove_articles(handle_punc(lower(replace_underscore(answer)))))).strip() + + def set_eval_state(self) -> None: + """Set the state of the chatbot to the evaluation state. 
This is used to change the chunk size, chunk overlap, and k""" + self.chunk_overlap = 150 + self.chunk_size = 750 + self.k = 1 + self.model.model_kwargs['output_len'] = 40 + + def reload_chat_state(self) -> None: + """Reload the chatbot state to the saved state the user set when creating the chatbot""" + self.chunk_overlap = self.saved_state['chunk_overlap'] + self.chunk_size = self.saved_state['chunk_size'] + self.k = self.saved_state['k'] + + def evaluate_simple(self, + data_path: str, + answer_question_string_template: str) -> str: + """Evaluate the chatbot on simple retrieval dataset given a data_path and a chain + + Args: + data_path (str): The path to the dataset + answer_question_string_template (str): The prompt to use for the chain + + Returns: + str: The score of the chatbot on the dataset including number of exact matches, close matches, and total questions + """ + chain = self.create_chain(prompt_template=answer_question_string_template) + exact_match = 0 + close_match = 0 + total = 1 + total_lines = sum(1 for _ in open(data_path)) + + with open(data_path, 'r') as file: + for line in tqdm(file, total=total_lines, desc="Processing lines"): + data = json.loads(line) + question = data.get('context') + continuation = data.get('continuation') + response = chain(question) + answer = self.clean_response(response['result'].lstrip('\n')) + if self.normalize_str(answer) == self.normalize_str(continuation): + exact_match += 1 + elif self.normalize_str(continuation).replace(" ", "") in self.normalize_str(answer).replace(" ", ""): + close_match += 1 + else: + print('\n', self.normalize_str(answer), '||', self.normalize_str(continuation), '\n') + print(f'{exact_match} exact matches and {close_match} close matches out of {total} questions.') + total += 1 + time.sleep(0.5) + return f'Given Score: {(exact_match + 0.5*close_match)/ total} with {exact_match} exact matches and {close_match} close matches out of {total} questions.' 
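+
+    # Note: evaluate_simple (above) and evaluate_complex (below) both expect a .jsonl
+    # file (see eval_data/) in which each line has a 'context' field holding the
+    # question and a 'continuation' field holding the reference answer.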
+ + def evaluate_complex(self, + data_path: str, + answer_question_string_template: str) -> str: + """Evaluate the chatbot on complex eval dataset given a data_path and a chain + + Args: + data_path (str): The path to the dataset + answer_question_string_template (str): The prompt to use for the chain + + Returns: + A long string of all questions, answers, and responses + """ + chain = self.create_chain(prompt_template=answer_question_string_template) + total_lines = sum(1 for _ in open(data_path)) + with open(data_path, 'r') as file: + save = '' + for line in tqdm(file, total=total_lines, desc="Processing lines"): + data = json.loads(line) + question = data.get('context') + continuation = data.get('continuation') + response = chain(question) + answer = self.clean_response(response['result'].lstrip('\n')) + save += f'Question:\n{question}\nAnswer:\n{continuation}\nResponse:\n{answer}\n\n' + return save + + def sub_query_chat(self, + query: str, + threshold = 0.4)-> str: + if not self.intent_chain: + save_k = self.k + self.k = 5 + self.intent_chain = self.create_chain(prompt_template=SUBQUERY_INTENT_TEMPLATE) + self.k = save_k + intent_response = self.intent_chain(query) + intent_answer = self.clean_response(intent_response['result'].lstrip('\n')) + + SUBQUERY_SUBQA_TEMPLATE = PARTIAL_SUBQA_TEMPLATE.format(intent_answer) + subQA_chain = self.create_chain(prompt_template=SUBQUERY_SUBQA_TEMPLATE) + subQA_response = subQA_chain(query) + subQA_answer = self.clean_response(subQA_response['result'].lstrip('\n')) + + all_sub_QA = subQA_answer.split('\n') + sub_QA_injection = '' + # Don't create a new chain on every query + if not self.subchain: + self.subchain = self.create_chain(prompt_template=CHAT_30B_TEMPLATE, score_threshold=threshold) + for sub_QA in all_sub_QA: + if sub_QA: + response = self.subchain(sub_QA) + answer = self.clean_response(response['result'].lstrip('\n')) + if response['source_documents'] and response["source_documents"][0].metadata["score"]>threshold: + answer = self.clean_response(response['result'].lstrip('\n')) + sub_QA_injection += f'Question: {sub_QA} \nAnswer: {answer}\n' + + if sub_QA_injection: + SUBQUERY_COMBINE_TEMPLATE = PARTIAL_COMBINE_TEMPLATE.format(str(sub_QA_injection).replace("{", "{{").replace("}", "}}")) + combine_chain = self.create_chain(prompt_template=SUBQUERY_COMBINE_TEMPLATE) + combine_response = combine_chain(query) + combine_answer = self.clean_response(combine_response['result'].lstrip('\n')) + combine_answer_sources = '' + for d in combine_response['source_documents']: + if d.metadata["score"] > 0.6: + combine_answer_sources = combine_answer_sources + f'{d.metadata["file_name"].replace("{slash}", "/")}\n' + + if not combine_answer_sources: + return f'Answer: \n{str(combine_answer)}\n\nIntent: \n{str(intent_answer)}\n\n Sub-questions: \n{str(sub_QA_injection)}' + else: + return f'Answer: \n{str(combine_answer)}\n\nIntent: \n{str(intent_answer)}\n\n Sub-questions: \n{str(sub_QA_injection)}\nSources: \n{str(combine_answer_sources)}' + else: + return f"I'm not sure but here is my best answer: \n{self.chat(query)[7:]}" + + + def relation_sub_query_chat(self, + query: str, + threshold: int=0.4)-> str: + if not self.intent_chain: + save_k = self.k + self.k = 3 + self.intent_chain = self.create_chain(prompt_template=SUBQUERY_INTENT_TEMPLATE) + self.k = save_k + intent_response = self.intent_chain(query) + intent_answer = self.clean_response(intent_response['result'].lstrip('\n')) + + SUBQUERY_SUBQA_TEMPLATE = PARTIAL_SUBQA_TEMPLATE.format(intent_answer) 
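+        # Ask the model to break the user's query into smaller sub-questions, using the
+        # intent answer above to keep the sub-questions focused on the original question.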
+ subQA_chain = self.create_chain(prompt_template=SUBQUERY_SUBQA_TEMPLATE) + subQA_response = subQA_chain(query) + subQA_answer = self.clean_response(subQA_response['result'].lstrip('\n')) + + all_sub_QA = subQA_answer.split('\n') + sub_QA_injection = '' + # Don't create a new chain on every query + if not self.subsubchain: + save_k = self.k + self.k = 2 + self.subsubchain = self.create_chain(prompt_template=SUBQUERY_RELATED_TEMPLATE, score_threshold=0) + self.k = save_k + for sub_QA in all_sub_QA: + if sub_QA: + answerable = self.clean_response(self.subsubchain(sub_QA)['result'].lstrip('\n')) + if "Yes" in answerable: + if not self.subchain: + self.subchain = self.create_chain(prompt_template=CHAT_30B_TEMPLATE) + response = self.subchain(sub_QA) + answer = self.clean_response(response['result'].lstrip('\n')) + sub_QA_injection += f'Question: {sub_QA} \nAnswer: {answer}\n' + + if sub_QA_injection: + SUBQUERY_COMBINE_TEMPLATE = PARTIAL_COMBINE_TEMPLATE.format(str(sub_QA_injection).replace("{", "{{").replace("}", "}}")) + combine_chain = self.create_chain(prompt_template=SUBQUERY_COMBINE_TEMPLATE) + combine_response = combine_chain(query) + combine_answer = self.clean_response(combine_response['result'].lstrip('\n')) + sources = '' + for d in combine_response['source_documents']: + if d.metadata["score"] > threshold: + sources = sources + f'{d.metadata["file_name"].replace("{slash}", "/")}\n' + if not sources: + return f'Answer: \n{str(combine_answer)}\n\nIntent: \n{str(intent_answer)}\n\n Sub-questions: \n{str(sub_QA_injection)}' + else: + return f'Answer: \n{str(combine_answer)}\n\nIntent: \n{str(intent_answer)}\n\n Sub-questions: \n{str(sub_QA_injection)}\nSources: \n{str(sources)}' + else: + return f"I'm not sure but here is my best answer: \n{self.chat(query)[7:]}" + + def chat(self, + query: str) -> str: + """Chat with the chatbot given a query + + Args: + query (str): The query to ask the chatbot + """ + + # Don't create a new chain on every query + if not self.chat_chain: + self.chat_chain = self.create_chain(prompt_template=CHAT_30B_TEMPLATE, score_threshold=0) + response = self.chat_chain(query) + answer = self.clean_response(response['result'].lstrip('\n')) + sources = '' + for d in response['source_documents']: + if d.metadata["score"] > 0.6: + sources = sources + f'{d.metadata["file_name"].replace("{slash}", "/")}\n' + if not sources: + return f"Answer: \n{answer}" + else: + return f"Answer: \n{answer} \nSources: \n{sources}" \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/eval_data/complex_eval.jsonl b/examples/end-to-end-examples/support_chatbot/eval_data/complex_eval.jsonl new file mode 100644 index 000000000..59e1bc704 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/eval_data/complex_eval.jsonl @@ -0,0 +1,11 @@ +{"context": "How do the schedulers work with resume?", "continuation": "Resume should load all of the state, including the scheduler state"} +{"context": "What are some pros and cons of using EMA?", "continuation": "EMA trades off the quality of the model for faster training times"} +{"context": "What are some pros and cons of using ALiBi?", "continuation": "ALiBi increases training speed and is memory effeicient, but has some extra computational overhead"} +{"context": "What is the intuition behind LayerFreezing?", "continuation": "The intuition is that earlier layers learn their features sooner than later layers, so by freezing them we can reduce the workload in backpropagation"} +{"context": "What Callbacks 
should I use if I want to implement early stopping using Composer? Does such a callback even exist?", "continuation": "EarlyStopper and ThresholdStopper"} +{"context": "Why do I need to import all this stuff from composer instead of just using the PyTorch version?", "continuation": "Composer includes many optimizations that increase accuracy while reducing training time!"} +{"context": "What's the difference between a Callback and an Algorithm?", "continuation": "A callback is called during training to manually intervene with the training run while an algorithm is used to modify the training run calculations."} +{"context": "How do you set the save interval for checkpointing a training run?", "continuation": "there is a save_interval argument in Trainer"} +{"context": "Was it by design that composer doesn't automatically log the validation loss?", "continuation": "We currently don't log eval loss and instead recommend attaching a metric to compute eval loss if you would like to do so. This is because certain datasets for eval might not have a way to compute loss, so calling the loss function would not work and potentially break evaluation."} +{"context": "How do you implement the various speed up algorithms in a training run?", "continuation": "Initialize them in a list and set it to be the algorithms argument in Trainer"} +{"context": "What are the steps to finetune a model?", "continuation": "Install Composer, create dataloaders, set optimizers and learning rates, load the pretrained model, run the Trainer"} diff --git a/examples/end-to-end-examples/support_chatbot/eval_data/composer_docstrings.jsonl b/examples/end-to-end-examples/support_chatbot/eval_data/composer_docstrings.jsonl new file mode 100644 index 000000000..8343bf0a8 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/eval_data/composer_docstrings.jsonl @@ -0,0 +1,653 @@ +{"context": "What algorithm dispenses with position embeddings for tokens in transformer-based NLP models, instead encoding position information by biasing the query-key attention scores proportionally to each token pair's distance?", "continuation": "ALiBi"} +{"context": "What algorithm yields excellent extrapolation to unseen sequence lengths compared to other position embedding schemes?", "continuation": "AliBi"} +{"context": "What function removes position embeddings and replaces the attention function and attention mask in AliBi?", "continuation": "apply_alibi()"} +{"context": "What algorithm creates multiple independent realizations of sequences of image augmentations, applies each sequence with random intensity, and returns a convex combination of the augmented images and the original image?", "continuation": "AugMix"} +{"context": "What function applies the AugMix data augmentation?", "continuation": "augmix_image()"} +{"context": "What creates width sequences of depth image augmentations, applies each sequence with random intensity, and returns a convex combination of the width augmented images and the original image such that coefficients for mixing the augmented images are drawn from a uniform Dirichlet(alpha, alpha, ...) 
distribution and the coefficient for mixing the combined augmented image and the original image is drawn from a Beta(alpha, alpha) distribution, using the same alpha?", "continuation": "AugMix"} +{"context": "What adds anti-aliasing filters to convolutional layers to increase accuracy and invariance to small shifts in the input?", "continuation": "blurpool"} +{"context": "What blurpool function applies a spatial low-pass filter?", "continuation": "blur_2d()"} +{"context": "What blurpool function applies max-pooling with anti-aliasing?", "continuation": "blurmax_pool2d()"} +{"context": "What blurpool function can be understood as decoupling the max from the pooling, and inserting a low-pass filtering step between the two, then computes the max within spatial neighborhoods of shape kernel_size, then applies an anti-aliasing filter to smooth the maxes, and only then pools according to stride?", "continuation": "blurmax_pool2d()"} +{"context": "What module is a drop-in replacement for torch.nn.MaxPool2d, but with an anti-aliasing filter?", "continuation": "BlurMaxPool2d"} +{"context": "What module is a drop-in replacement for torch.nn.Conv2d, but with an anti-aliasing filter?", "continuation": "BlurConv2d"} +{"context": "What module just calls the function blur_2d in forward using the provided arguments?", "continuation": "BlurConv2d"} +{"context": "What algorithm adds anti-aliasing filters to convolutional layers and increases accuracy and invariance to small shifts in the input?\nAnswer: ", "continuation": "BlurPool"} +{"context": "What function changes the memory format of the model to torch.channels_last and usually yields improved GPU utilization?", "continuation": "apply_channels_last()"} +{"context": "What algorithm changes the memory format of the model to torch.channels_last and usually yields improved GPU utilization?", "continuation": "ChannelsLast"} +{"context": "What algorithm drops a fraction of the rows and columns of an input image?", "continuation": "ColOut"} +{"context": "What function applies ColOut augmentation to a batch of images and (optionally) targets, dropping the same random rows and columns from all images and targets in a batch?", "continuation": "colout_batch()"} +{"context": "What object applies Torchvision-like transformations for performing the ColOut augmentation,where random rows and columns are dropped from up to two Torch tensors or two PIL images?", "continuation": "ColOutTransform"} +{"context": "What algorithm trains the network on non-overlapping combinations of pairs of examples and iterpolated targets rather than individual examples and targets?", "continuation": "CutMix"} +{"context": "What function creates new samples using combinations of pairs of samples by masking a region of each image in input and filling the masked region with the corresponding content from a random different image in input? 
The position of the masked region is determined by drawing a center point uniformly at random from all spatial positions.", "continuation": "cutmix_batch()"} +{"context": "What function generates indices of a random permutation of elements of a batch?", "continuation": "_gen_indices()"} +{"context": "What function generates lambda from Beta(alpha, alpha)?", "continuation": "_gen_cutmix_coef()"} +{"context": "What function randomly samples a bounding box with area determined by input cutmix_lambda?", "continuation": "_rand_bbox()"} +{"context": "What function rescales the cutmix lambda according to the size of the clipped bounding box?", "continuation": "_adjust_lambda()"} +{"context": "What algorithm is a data augmentation technique that works by masking out one or more square regions of an input image?", "continuation": "CutOut"} +{"context": "What function applies the CutOut augmentation to a batch of images?", "continuation": "cutout_batch()"} +{"context": "What algorithm maintains a moving average of model parameters and uses these at test time", "continuation": "EMA"} +{"context": "What function updates the weights of a ema_model to be closer to the weights of input model according to an exponential weighted average?", "continuation": "compute_ema()"} +{"context": "What function summons full params for FSDP, which is required to update sharded params?", "continuation": "get_model_context_manager()"} +{"context": "What EMA method ensures state dicts created prior to Composer 0.13.0 are compatible with later versions?", "continuation": "ensure_compatible_state_dict()"} +{"context": "What EMA method replaces the parameters of the supplied model with the ema parameters if they are not already active?", "continuation": "get_ema_model()"} +{"context": "What EMA method replaces the parameters of the supplied model with the training parameters if they are not already active?", "continuation": "get_training_model()"} +{"context": "What class stores the parameters and buffers of a model needed for averaging?", "continuation": "EMAParameters"} +{"context": "What EMAParameters swaps the parameters and buffers of a model with the ema parameters?", "continuation": "swap_params()"} +{"context": "What EMAParameters transfers the parameters and buffers from the ema model to the supplied model?", "continuation": "transfer_ema_params()"} +{"context": "What EMAParameters moves the ema parameters and buffers to the device of a destination model?", "continuation": "move_params_to_device()"} +{"context": "What algorithm decomposes linear operators into pairs of smaller linear operators?", "continuation": "Factorize"} +{"context": "What class bundles tensors used by a factorized linear operator?", "continuation": "LowRankSolution"} +{"context": "What function approximates a matrix by factorizing it into a product of two smaller matrices?", "continuation": "factorize_matrix()"} +{"context": "What function returns whether factorizing a module a given amount could possibly yield a benefit?", "continuation": "factorizing_could_speedup()"} +{"context": "What module is a factorized replacement for torch.nn.Conv2d?", "continuation": "FactorizedConv2d"} +{"context": "What module is a factorized replacement for torch.nn.Linear?", "continuation": "FactorizedLinear"} +{"context": "What function replaces torch.nn.Linear and torch.nn.Conv2d modules with FactorizedLinear and FactorizedConv2d modules?", "continuation": "apply_factorization()"} +{"context": "What algorithm replaces all instances of torch.nn.LayerNorm with a 
apex.normalization.fused_layer_norm.FusedLayerNorm? By fusing multiple kernel launches into one, this usually improves GPU utilization.", "continuation": "FusedLayerNorm"} +{"context": "What function defines a replacement policy from a torch.nn.LayerNorm to a apex.normalization.fused_layer_norm?", "continuation": "from_LayerNorm()"} +{"context": "What function replaces all instances of torch.nn.LayerNorm with a apex.normalization.fused_layer_norm.FusedLayerNorm?", "continuation": "apply_fused_layernorm()"} +{"context": "What function replaces the Linear layers in the feed-forward network with Gated Linear Units? This leads to improved convergence with a slight drop in throughput.", "continuation": "GatedLinearUnits"} +{"context": "What module defines a single feed-forward block that uses Gated Linear Units", "continuation": "BERTGatedFFOutput"} +{"context": "What function defines a replacement policy from a transformers.models.bert.modeling_bert.BertOutput to a composer.algorithms.gated_linear_units.gated_linear_unit_layers.BERTGatedFFOutput?", "continuation": "from_BertOutput()"} +{"context": "What function defines a replacement policy from a transformers.models.bert.modeling_bert.BertIntermediate to a torch.nn.Identity? The identity effectively acts as no-op.", "continuation": "from_BertIntermediate"} +{"context": "What function replaces the Linear layers in the feed-forward network with Gated Linear Units?", "continuation": "apply_gated_linear_units()"} +{"context": "What function replace batch normalization modules with ghost batch normalization modules?", "continuation": "apply_ghost_batchnorm()"} +{"context": "What algorithm replaces batch normalization modules with Ghost Batch Normalization that simulate the effect of using a smaller batch size.", "continuation": "GhostBatchNorm"} +{"context": "What funtion clips all gradients in model based on specified clipping_type?", "continuation": "apply_gradient_clipping()"} +{"context": "What algorithm clips all gradients in model based on specified clipping_type?", "continuation": "GradientClipping"} +{"context": "What GradientClipping method slips all gradients in model based on ratio of gradient norms to parameter norms?", "continuation": "_get_clipped_gradient_coeff()"} +{"context": "What GradientClipping method implements unitwise norm?", "continuation": "_unitwise_norm()"} +{"context": "What function replaces all instances of torch.nn.Dropout with a GyroDropout? By masking Dropout layer, this usually improves accuracy.", "continuation": "apply_gyro_dropout()"} +{"context": "What algorithm replaces all instances of torch.nn.Dropout with a GyroDropout? 
By masking the Dropout layer, this usually improves accuracy.", "continuation": "GyroDropout"} +{"context": "What function shrinks targets towards a uniform distribution?", "continuation": "smooth_labels()"} +{"context": "What algorithm shrinks targets towards a uniform distribution?", "continuation": "LabelSmoothing"} +{"context": "What function progressively freezes the layers of the network in-place during training, starting with the earlier layers?", "continuation": "freeze_layers()"} +{"context": "What algorithm progressively freezes the layers of the network in-place during training, starting with the earlier layers?", "continuation": "LayerFreezing"} +{"context": "What LayerFreezing method implements a linear schedule for freezing?", "continuation": "_freeze_schedule()"} +{"context": "What LayerFreezing method is a helper function to get all submodules?", "continuation": "_get_layers()"} +{"context": "What LayerFreezing method is a helper function to freeze the training of a parameter?", "continuation": "_remove_param_from_optimizers()"} +{"context": "What algorithm replaces all instances of torch.nn.GroupNorm with LPGroupNorm where LPGroupNorm is a thin wrapper around torch.nn.GroupNorm which forces the layer to run in lower precision if autocast is enabled?", "continuation": "LowPrecisionGroupNorm"} +{"context": "What algorithm replaces all instances of torch.nn.LayerNorm with LPLayerNorm where LPLayerNorm is a thin wrapper around torch.nn.LayerNorm which forces the layer to run in lower precision if autocast is enabled?", "continuation": "LowPrecisionLayerNorm"} +{"context": "What LPLayerNorm method defines a replacement policy from a torch.nn.LayerNorm to a LPLayerNorm?", "continuation": "_to_LPLayerNorm()"} +{"context": "What LPLayerNorm method defines a replacement policy from a torch.nn.LayerNorm to a apex.normalization.fused_layer_norm?", "continuation": "_to_FusedLayerNorm()"} +{"context": "What function creates new samples using convex combinations of pairs of samples?", "continuation": "mixup_batch()"} +{"context": "What algorithm uses individual examples and targets to make a convex combination of a given batch X with a randomly permuted copy of X? 
The mixing coefficient is drawn from a Beta(alpha, alpha) distribution.", "continuation": "MixUp"} +{"context": "What function samples max(z, 1-z), z ~ Beta(alpha, alpha) for MixUp?", "continuation": "_gen_mixing_coef()"} +{"context": "What function generates a random permutation of the batch indices?", "continuation": "_gen_indices()"} +{"context": "What object is a dummy model used for performance measurements?", "continuation": "NoOpModelClass"} +{"context": "What algorithm replaces torch.nn.Module with a dummy model of type NoOpModelClass?", "continuation": "NoOpModel"} +{"context": "What function resizes inputs and optionally outputs by cropping or interpolating?", "continuation": "resize_batch()"} +{"context": "What algorithm resizes inputs and optionally outputs by cropping or interpolating?", "continuation": "ProgressiveResizing"} +{"context": "What ProgressiveResizing method makes a random crop transform for an input image?", "continuation": "_make_crop()"} +{"context": "What ProgressiveResizing method makes a pair of random crops for an input image X and target tensor y?", "continuation": "_make_crop_pair()"} +{"context": "What ProgressiveResizing method makes a nearest-neighbor interpolation transform at the specified scale factor?", "continuation": "_make_resize()"} +{"context": "What function randomly applies a sequence of image data augmentations to an image or batch of images?", "continuation": "randaugment_image()"} +{"context": "What algorithm randomly applies a sequence of image data augmentations to an image?", "continuation": "RandAugment"} +{"context": "What object wraps an optimizer with sharpness-aware minimization?", "continuation": "SAMOptimizer"} +{"context": "What algorithm adds sharpness-aware minimization by wrapping an existing optimizer with a SAMOptimizer? SAM can improve model generalization and provide robustness to label noise.", "continuation": "SAM"} +{"context": "What function decides if selective backprop should be run based on time in training?", "continuation": "should_selective_backprop()"} +{"context": "What function prunes minibatches as a subroutine of SelectiveBackprop and computes the loss function on the provided training examples and runs minibatches according to the difficulty? The fraction of the minibatch that is kept for gradient computation is specified by the argument 0 <= keep <= 1.", "continuation": "select_using_loss()"} +{"context": "What algorithm selectively backpropagates gradients from a subset of each batch?", "continuation": "SelectiveBackprop"} +{"context": "What function sets the sequence length of a batch by changing the sequence length of all tensors in the provided dictionary to curr_seq_len by either truncating the tensors or reshaping the tensors to create new examples from the extra tokens?", "continuation": "set_batch_sequence_length()"} +{"context": "What algorithm progressively increases the sequence length during training?", "continuation": "SeqLengthWarmup"} +{"context": "What function adds Squeeze-and-Excitation blocks after torch.nn.Conv2d layers?", "continuation": "apply_squeeze_excite()"} +{"context": "What object adds Squeeze-and-Excitation blocks, which apply global average pooling to the input, feed the resulting vector to a single-hidden-layer fully-connected network (MLP), and use the outputs of this MLP as attention coefficients to rescale the input. 
This allows the network to take into account global information about each input, as opposed to only local receptive fields like in a convolutional layer.", "continuation": "SqueezeExcite2d"} +{"context": "What object is a helper class used to add a SqueezeExcite2d module after a torch.nn.Conv2d?", "continuation": "SqueezeExciteConv2d"} +{"context": "What algorithm adds Squeeze-and-Excitation blocks after the torch.nn.Conv2d modules in a neural network?", "continuation": "SqueezeExcite"} +{"context": "What function applies Stochastic Depth to the specified model?", "continuation": "apply_stochastic_depth()"} +{"context": "What algorithm replaces the specified target layer with a stochastic version of the layer? The stochastic layer will randomly drop either samples or the layer itself depending on the stochastic method specified.", "continuation": "StochasticDepth"} +{"context": "What function is the ResNet Bottleneck forward function where the layers are randomly skipped with probability drop_rate during training?", "continuation": "block_stochastic_forward()"} +{"context": "What function randomly drops samples from the input batch according to the sample_drop_rate by setting the samples to be dropped to zeros?", "continuation": "_sample_drop()"} +{"context": "What function is the model surgery policy that dictates how to convert a ResNet Bottleneck layer into a stochastic version?", "continuation": "make_resnet_bottleneck_stochastic()"} +{"context": "What module is a convenience class that stochastically executes the provided main path of a residual block?", "continuation": "BlockStochasticModule"} +{"context": "What algorithm applies Stochastic Weight Averaging, which averages model weights sampled at different times near the end of training? 
This leads to better generalization than just using the final trained weights.", "continuation": "SWA"} +{"context": "What algorithm needs to maintain both the current value of the weights and the average of all of the sampled weights, which doubles the model's memory consumption?", "continuation": "SWA"} +{"context": "What function converts between torch.Tensor and PIL.Image.Image image representations?", "continuation": "image_as_type()"} +{"context": "What function lifts a function that requires pillow images to also work on tensors?", "continuation": "map_pillow_function()"} +{"context": "What is a helper function to scale a value between 0 and maxval and return it as an int?", "continuation": "_int_parameter()"} +{"context": "What is a helper function to scale a value between 0 and maxval and return it as a float?", "continuation": "_float_parameter()"} +{"context": "What is a function to sample from a uniform distribution between 0.1 and some value n?", "continuation": "_sample_level()"} +{"context": "What is a function to sample from a symmetric distribution?", "continuation": "_symmetric_sample()"} +{"context": "What function autocontrasts an image?", "continuation": "autocontrast()"} +{"context": "What function equalizes an image?", "continuation": "equalize()"} +{"context": "What function posterizes an image?", "continuation": "posterize()"} +{"context": "What function rotates an image?", "continuation": "rotate()"} +{"context": "What function solarizes an image?", "continuation": "solarize()"} +{"context": "What function shears an image horizontally?", "continuation": "shear_x()"} +{"context": "What function shears an image vertically?", "continuation": "shear_y()"} +{"context": "What function translates an image horizontally?", "continuation": "translate_x()"} +{"context": "What function translates an image vertically?", "continuation": "translate_y()"} +{"context": "What function enhances color on an image?", "continuation": "color()"} +{"context": "What function enhances color on an image, following the corruptions in the ImageNet-C/CIFAR10-C test sets?", "continuation": "color_original()"} +{"context": "What function enhances contrast on an image?", "continuation": "contrast()"} +{"context": "What function enhances contrast on an image, following the corruptions in the ImageNet-C/CIFAR10-C test sets?", "continuation": "contrast_original()"} +{"context": "What function enhances brightness on an image?", "continuation": "brightness()"} +{"context": "What function enhances brightness on an image, following the corruptions in the ImageNet-C/CIFAR10-C test sets?", "continuation": "brightness_original()"} +{"context": "What function enhances sharpness on an image?", "continuation": "sharpness()"} +{"context": "What function enhances sharpness on an image, following the corruptions in the ImageNet-C/CIFAR10-C test sets?", "continuation": "sharpness_original()"} +{"context": "What function standardizes the input weight W?", "continuation": "_standardize_weights()"} +{"context": "What module should you use to apply weight standardization with torch's parametrization package?", "continuation": "WeightStandardizer"} +{"context": "What function applies Weight Standardization, which standardizes convolutional weights in a model?", "continuation": "apply_weight_standardization()"} +{"context": "What algorithm standardizes convolutional weights in a model?", "continuation": "WeightStandardization"} +{"context": "What callback logs stats of activation inputs and outputs?", "continuation": 
"ActivationMonitor"} +{"context": "What function helps create a checkpoint scheduler according to a specified interval?", "continuation": "checkpoint_periodically()"} +{"context": "What callback saves checkpoints?", "continuation": "CheckpointSaver"} +{"context": "What callback tracks a metric and halts training if it does not improve within a given interval?", "continuation": "EarlyStopper"} +{"context": "What callback exports models for inference?", "continuation": "ExportForInferenceCallback"} +{"context": "What callback checks for GPU health?", "continuation": "HealthChecker"} +{"context": "What callback logs image inputs and optionally outputs?", "continuation": "ImageVisualizer"} +{"context": "What callback logs the learning rate?", "continuation": "LRMonitor"} +{"context": "What callback logs the memory usage of the model?", "continuation": "MemoryMonitor"} +{"context": "What callback creates compliant results file for MLPerf Training benchmark?", "continuation": "MLPerfCallback"} +{"context": "What callback computes and logs the L2 norm of gradients as well as any optimizer-specific metrics implemented in the optimizer's report_per_parameter_metrics method?", "continuation": "OptimizerMonitor"} +{"context": "What callback estimates total training time?", "continuation": "RuntimeEstimator"} +{"context": "What callback logs the training throughput and utilization?", "continuation": "SpeedMonitor"} +{"context": "What callback halts training when a metric value reaches a certain threshold?", "continuation": "ThresholdStopper"} +{"context": "What file runs the Composer CLI launcher for distributed training?", "continuation": "launcher.py"} +{"context": "What are pieces of code which run at specific events (Event) in the training loop and modify the trainer's State, generally with the effect of improving the model's quality or increasing the efficiency and throughput of the training loop?", "continuation": "Algorithms"} +{"context": "What function indicates whether this algorithm may cause some model parameters to be unused?", "continuation": "find_unused_parameters()"} +{"context": "What function outputs whether this algorithm requires the backwards pass to be differentiable?", "continuation": "backwards_create_graph()"} +{"context": "What function returns True to indicate this algorithm is required when loading from a checkpoint which used it?", "continuation": "required_on_load()"} +{"context": "What function determines whether this algorithm should run given the current Event and State?", "continuation": "match()"} +{"context": "What function applies the algorithm to make an in-place change to the State?", "continuation": "apply()"} +{"context": "What objects provide hooks that can run at each training loop Event and is similar to an Algorithm in that they are run on specific events, but it differs from an Algorithm in that it should not modify the training of the model?", "continuation": "Callback"} +{"context": "What Callback method is called by engine on each event?", "continuation": "run_event()"} +{"context": "What Callback method is called by the Event.INIT event?", "continuation": "init()"} +{"context": "What Callback method is called by the Event.AFTER_LOAD event?", "continuation": "after_load()"} +{"context": "What Callback method is called by the Event.FIT_START event?", "continuation": "fit_start()"} +{"context": "What Callback method is called by the Event.EPOCH_START event?", "continuation": "epoch_start()"} +{"context": "What Callback method is called by the 
Event.BEFORE_DATALOADER event?", "continuation": "before_dataloader()"} +{"context": "What Callback method is called by the Event.AFTER_DATALOADER event?", "continuation": "after_dataloader()"} +{"context": "What Callback method is called by the Event.BATCH_START event?", "continuation": "batch_start()"} +{"context": "What Callback method is called by the Event.BEFORE_TRAIN_BATCH event?", "continuation": "before_train_batch()"} +{"context": "What Callback method is called by the Event.BEFORE_FORWARD event?", "continuation": "before_forward()"} +{"context": "What Callback method is called by the Event.BEFORE_LOSS event?", "continuation": "before_loss()"} +{"context": "What Callback method is called by the Event.AFTER_LOSS event?", "continuation": "after_loss()"} +{"context": "What Callback method is called by the Event.BEFORE_BACKWARD event?", "continuation": "before_backward()"} +{"context": "What Callback method is called by the Event.AFTER_BACKWARD event?", "continuation": "after_backward()"} +{"context": "What Callback method is called by the Event.AFTER_TRAIN_BATCH event?", "continuation": "after_train_batch()"} +{"context": "What Callback method is called by the Event.BATCH_END event?", "continuation": "batch_end()"} +{"context": "What Callback method is called by the Event.BATCH_CHECKPOINT event?", "continuation": "batch_checkpoint()"} +{"context": "What Callback method is called by the Event.EPOCH_END event?", "continuation": "epoch_end()"} +{"context": "What Callback method is called by the Event.EPOCH_CHECKPOINT event?", "continuation": "epoch_checkpoint()"} +{"context": "What Callback method is called by the Event.PREDICT_START event?", "continuation": "predict_start()"} +{"context": "What Callback method is called by the Event.PREDICT_BATCH_START event?", "continuation": "predict_batch_start"} +{"context": "What Callback method is called by the Event.PREDICT_BATCH_FORWARD event?", "continuation": "predict_before_forward()"} +{"context": "What Callback method is called by the Event.PREDICT_AFTER_FORWARD event?", "continuation": "predict_after_forward()"} +{"context": "What Callback method is called by the Event.PREDICT_BATCH_END event?", "continuation": "predict_batch_end()"} +{"context": "What Callback method is called by the Event.PREDICT_END event?", "continuation": "predict_end()"} +{"context": "What Callback method is called by the Event.EVAL_BEFORE_ALL event?", "continuation": "eval_before_all()"} +{"context": "What Callback method is called by the Event.EVAL_START event?", "continuation": "eval_start()"} +{"context": "What Callback method is called by the Event.EVAL_BATCH_START event?", "continuation": "eval_batch_start()"} +{"context": "What Callback method is called by the Event.EVAL_BATCH_FORWARD event?", "continuation": "eval_before_forward()"} +{"context": "What Callback method is called by the Event.EVAL_AFTER_FORWARD event?", "continuation": "eval_after_forward()"} +{"context": "What Callback method is called by the Event.EVAL_BATCH_END event?", "continuation": "eval_batch_end()"} +{"context": "What Callback method is called by the Event.EVAL_END event?", "continuation": "eval_end()"} +{"context": "What Callback method is called by the Event.EVAL_AFTER_ALL event?", "continuation": "eval_after_all()"} +{"context": "What Callback method is called by the Event.FIT_END event?", "continuation": "fit_end()"} +{"context": "What Callback method is called whenever the trainer finishes training, even when there is an exception?", "continuation": "close()"} +{"context": "What 
Callback method is called after Callback method close() has been invoked for each callback?", "continuation": "post_close()"} +{"context": "What function splits batches into chunks of size microbatch_size for gradient accumulation?", "continuation": "_default_split_batch()"} +{"context": "What object contains specifications for operating and training on data?", "continuation": "DataSpec"} +{"context": "What function ensures that the dataloader is a DataSpec?", "continuation": "ensure_data_spec()"} +{"context": "What is a coordinator for running algorithms and resolving ordering conflicts among them for composition?", "continuation": "Engine"} +{"context": "Does the order in which algorithms are run matter?", "continuation": "Yes"} +{"context": "What object records the algorithm's execution", "continuation": "Trace"} +{"context": "What Engine method runs the sequence of algorithms and callbacks?", "continuation": "run_event()"} +{"context": "What Engine method runs the marker for an event if the profiler is enabled?", "continuation": "run_marker_only_event()"} +{"context": "What Engine method registers an algorithm pass with the Engine?", "continuation": "register_pass()"} +{"context": "What Engine method runs compilation passes that modify the order and content of the list of algorithms?", "continuation": "_compile()"} +{"context": "What Engine method checks for open callbacks from previous runs and raises an error if so?", "continuation": "_check_for_still_open_callbacks()"} +{"context": "What Engine method runs a sequence of callbacks by calling the function for an event?", "continuation": "_run_callbacks()"} +{"context": "What Engine method includes timestampp and even info in log messages?", "continuation": "_debug_log()"} +{"context": "What Engine method shuts down the engine?", "continuation": "close()"} +{"context": "What function generates an evaluation interval callable?", "continuation": "evaluate_periodically()"} +{"context": "What object is a wrapper for a dataloader to include metrics that apply to a specific dataset?", "continuation": "Evaluator"} +{"context": "What function ensures that evaluator is an Evaluator type?", "continuation": "ensure_evaluator()"} +{"context": "What function ensures that automicrobatching is only on GPU?", "continuation": "validate_eval_automicrobatching()"} +{"context": "What function sets initial value of device_eval_microbatch_size?", "continuation": "_get_initial_device_eval_microbatch_size()"} +{"context": "What object is an Enum to represent training loop events that mark specific point in the training loop where an Algorithm and Callback can run?", "continuation": "Event"} +{"context": "What reorders or modifies the execution of algorithms by the Engine?", "continuation": "Algorithm Passes"} +{"context": "What function sorts instances of a provided class to the front?", "continuation": "sort_to_front()"} +{"context": "What function sorts instances of a provided class to the back?", "continuation": "sort_to_back()"} +{"context": "What should run before any algorithms modify the loss?", "continuation": "Selective Backprop"} +{"context": "What should run after other algorithms that add LayerNorms?", "continuation": "FusedLayerNorm"} +{"context": "What should run after other algorithms that add LayerNorms?", "continuation": "LowPrecisionLayerNorm"} +{"context": "What function establishes a FILO order of algorithms before and after events?", "continuation": "set_filo_order()"} +{"context": "What function throws a warning when multiple algorithms 
interpolate the loss?", "continuation": "warn_if_multiple_loss_interpolation()"} +{"context": "What object is an enum class for the numerical precision to be used by the model?", "continuation": "Precision"} +{"context": "What function returns a context manager to automatically cast to a specific precision?", "continuation": "get_precision_context()"} +{"context": "What object is the interface for serialization; used in checkpointing?", "continuation": "Serializable"} +{"context": "What Serializable method returns a dictionary representing the internal state?", "continuation": "state_dict()"} +{"context": "What Serializable method restores the state of the object?", "continuation": "load_state_dict()"} +{"context": "What function is the context manager for materializing or loading an fsdp module's state dict?", "continuation": "fsdp_state_dict_type_context()"} +{"context": "What function materializes a given model's optimizer's state_dict?", "continuation": "fsdp_get_optim_state_dict()"} +{"context": "What object reflects the state of the trainer?", "continuation": "State"} +{"context": "What State method gets the dataset contained by the given dataloader-like object?", "continuation": "_dataset_of()"} +{"context": "What State method gets the train dataloader?", "continuation": "train_dataloader()"} +{"context": "What State method gets the elapsed training duration?", "continuation": "get_elapsed_duration()"} +{"context": "What State method stops training?", "continuation": "stop_training()"} +{"context": "What State method gets an element from the batch either specified by a key or a user-specified function?", "continuation": "batch_get_item()"} +{"context": "What State method sets the element specified by the key of the set_fn to the specified value?", "continuation": "batch_set_item()"} +{"context": "What State method returns the State callbacks?", "continuation": "callbacks()"} +{"context": "What State method returns the State algorithms?", "continuation": "algorithms()"} +{"context": "What State method returns the State evaluators?", "continuation": "evaluators()"} +{"context": "What State method returns if deepspeed is enabled?", "continuation": "deepspeed_enabled()"} +{"context": "What State method returns if fsdp is enabled?", "continuation": "fsdp_enabled()"} +{"context": "What State method gets a dictionary of information about integrations to store in the state dict? This metadata is used for loading things from the state dict that need to be done outside of the normal Composer load path.", "continuation": "_get_integrations_state_dict()"} +{"context": "What State method gets a dictionary of metadata to store in the state dict? 
This metadata is used for checking compatibility between the current environment/setup and the environment/setup that was used for the checkpoint that is being loaded in.", "continuation": "_get_state_metadata()"} +{"context": "What State method collects the state dict(s) of our train and eval dataset(s)?", "continuation": "_dataset_state_dict()"} +{"context": "What State method collects the state dicts of our serializable attributes?", "continuation": "state_dict()"} +{"context": "What State method applies required algorithms which haven't been specified and aren't in the exclude list?", "continuation": "_apply_required_algorithms()"} +{"context": "What State method loads the model's state from state_dict?", "continuation": "load_model_state()"} +{"context": "What State method loads the optimizer state?", "continuation": "load_optim_state()"} +{"context": "What State method loads the dataset state?", "continuation": "_load_dataset_state()"} +{"context": "What State method loads the state?", "continuation": "load_state_dict()"} +{"context": "What State method loads the active dataloader?", "continuation": "dataloader()"} +{"context": "What State method returns the dataloader label for the active dataloader?", "continuation": "dataloader_label()"} +{"context": "What State method updates the active dataloader and dataloader label?", "continuation": "set_dataloader()"} +{"context": "What State method returns the number of batches per dataloader iteration as used by the trainer?", "continuation": "dataloader_len()"} +{"context": "What State method returns the numerical precision to use for training?", "continuation": "precision()"} +{"context": "What State method returns whether the model is an instance of DistributedDataParallel?", "continuation": "is_model_ddp()"} +{"context": "What State method casts the model to deepspeed.DeepSpeedEngine?", "continuation": "deepspeed_model()"} +{"context": "What object is an enum class to represent units of time for the training process?", "continuation": "TimeUnit"} +{"context": "What is EPOCH typically denoted as?", "continuation": "ep"} +{"context": "What is BATCH typically denoted as?", "continuation": "ba"} +{"context": "What is SAMPLE typically denoted as?", "continuation": "sp"} +{"context": "What is TOKEN typically denoted as?", "continuation": "tok"} +{"context": "What is DURATION typically denoted as?", "continuation": "dur"} +{"context": "What object represents static durations of training time in terms of a TimeUnit enum?", "continuation": "Time"} +{"context": "What Time method creates a Time object with units of TimeUnit.EPOCH?", "continuation": "from_epoch()"} +{"context": "What Time method creates a Time object with units of TimeUnit.BATCH?", "continuation": "from_batch()"} +{"context": "What Time method creates a Time object with units of TimeUnit.SAMPLE?", "continuation": "from_sample()"} +{"context": "What Time method creates a Time object with units of TimeUnit.TOKEN?", "continuation": "from_token()"} +{"context": "What Time method creates a Time object with units of TimeUnit.DURATION?", "continuation": "from_duration()"} +{"context": "What Time method returns the value of the time as a number?", "continuation": "value()"} +{"context": "What Time method returns the unit of the time?", "continuation": "unit()"} +{"context": "What Time method returns the time-string representation?", "continuation": "to_timestring()"} +{"context": "What Time method parses other objects into a Time object?", "continuation": "_parse()"} +{"context": "What Time method 
parses a time string into a Time instance?", "continuation": "from_timestring()"} +{"context": "What object represents a snapshot of the current training progress, measured in terms of epochs, batches, samples, tokens, and wall clock time?", "continuation": "Timestamp"} +{"context": "What Timestamp method returns all values of the timestamp object in a dictionary?", "continuation": "get_state()"} +{"context": "What Timestamp method returns the total epoch count?", "continuation": "epoch()"} +{"context": "What Timestamp method returns the total batch count?", "continuation": "batch()"} +{"context": "What Timestamp method returns the total sample count?", "continuation": "sample()"} +{"context": "What Timestamp method returns the total token count?", "continuation": "token()"} +{"context": "What Timestamp method returns the batch count in the current epoch?", "continuation": "batch_in_epoch()"} +{"context": "What Timestamp method returns the sample count in the current epoch?", "continuation": "sample_in_epoch()"} +{"context": "What Timestamp method returns the token count in the current epoch?", "continuation": "token_in_epoch()"} +{"context": "What Timestamp method returns the wall-clock duration (in seconds) from the beginning of training?", "continuation": "total_wct()"} +{"context": "What Timestamp method returns the wall-clock duration (in seconds) for the current epoch?", "continuation": "epoch_wct()"} +{"context": "What Timestamp method returns the wall-clock duration (in seconds) for the last batch?", "continuation": "batch_wct()"} +{"context": "What Timestamp method returns the current time in the specified unit?", "continuation": "get()"} +{"context": "What Timestamp method creates a new Timestamp, advanced to the next batch?", "continuation": "to_next_batch()"} +{"context": "What Timestamp method creates a new Timestamp, advanced to the next epoch?", "continuation": "to_next_epoch()"} +{"context": "What Timestamp method creates a copy of the timestamp?", "continuation": "copy()"} +{"context": "What function ensures maybe_time is an instance of Time?", "continuation": "ensure_time()"} +{"context": "What object is an enum to represent which mode the Trainer is in?", "continuation": "TrainerMode"} +{"context": "What object is an enum to represent different memory formats?", "continuation": "MemoryFormat"} +{"context": "What function builds the transformation for the ADE20k dataset?", "continuation": "build_ade20k_transformations()"} +{"context": "What function builds the ADE20k dataloader?", "continuation": "build_ade20k_dataloader()"} +{"context": "What function builds an ADE20k streaming dataset?", "continuation": "build_streaming_ade20k_dataloader()"} +{"context": "What function builds a synthetic ADE20k dataloader?", "continuation": "build_synthetic_ade20k_dataloader()"} +{"context": "What object resizes the image and target to base_size scaled by a randomly sampled variable?", "continuation": "RandomResizePair"} +{"context": "What object crops the image and target at a randomly sampled position?", "continuation": "RandomCropPair"} +{"context": "What object flips the image and target horizontally with a specified probability?", "continuation": "RandomHFlipPair"} +{"context": "What object pads an image to a specified size?", "continuation": "PadToSize"} +{"context": "What object applies a combination of brightness, contrast, saturation, and hue jitters with random intensities?", "continuation": "PhotometricDistortion"} +{"context": "What object is a PyTorch Dataset 
for ADE20k", "continuation": "ADE20k"} +{"context": "What function builds a BRaTS dataloader?", "continuation": "build_brats_dataloader()"} +{"context": "What function creates a custom collate function to handle images with different depths?", "continuation": "_my_collate()"} +{"context": "What function builds a DataSpec for the StreamingC4 dataset?", "continuation": "build_streaming_c4_dataloader()"} +{"context": "What function builds a CIFAR-10 dataloader with default transforms?", "continuation": "build_cifar10_dataloader()"} +{"context": "What function builds an FFCV CIFAR10 dataloader?", "continuation": "build_ffcv_cifar10_dataloader()"} +{"context": "What function builds a synthetic CIFAR-10 dataset for debugging or profiling?", "continuation": "build_synthetic_cifar10_dataloader()"} +{"context": "What function builds a streaming CIFAR10 dataset?", "continuation": "build_streaming_cifar10_dataloader()"} +{"context": "What function converts PyTorch compatible dataset into FFCV format at filepath write_path", "continuation": "write_ffcv_dataset()"} +{"context": "What function builds a ImageNet dataloader?", "continuation": "build_imagenet_dataloader()"} +{"context": "What function builds a synthetic ImageNet dataloader?", "continuation": "build_synthetic_imagenet_dataloader()"} +{"context": "What function converts an ImageNet dataset to FFCV format?", "continuation": "write_ffcv_imagenet()"} +{"context": "What function builds a FFCV ImageNet dataloader?", "continuation": "build_ffcv_imagenet_dataloader()"} +{"context": "What function builds an imagenet1k streaming dataset?", "continuation": "build_streaming_imagenet1k_dataloader()"} +{"context": "What Dataset constructs batches for in-context learning question answering evaluation?", "continuation": "InContextLearningQATaskDataset"} +{"context": "What InContextLearningQATaskDataset method prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples? Each task consists of a context and a continuation as well as an optional prompt and optional list of example context/continuation pairs which precede the test context/continuation pair.", "continuation": "prep_examples()"} +{"context": "What Dataset constructs batches for in-context learning language modeling evaluation?", "continuation": "InContextLearningLMTaskDataset"} +{"context": "What InContextLearningLMTaskDataset method prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples? Each task consists of a context and a continuation as well as an optional prompt and optional list of example context/continuation pairs which precede the test context/continuation pair.", "continuation": "prep_examples()"} +{"context": "What Dataset constructs batches for in-context learning multiple choice evaluation?", "continuation": "InContextLearningMultipleChoiceTaskDataset"} +{"context": "What InContextLearningMultipleChoiceTaskDataset method prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples? 
Each task consists of a context and a continuation as well as an optional prompt and optional list of example context/continuation pairs which precede the test context/continuation pair.", "continuation": "prep_examples()"} +{"context": "What Dataset constructs batches for in-context learning multiple choice evaluation?", "continuation": "InContextLearningMultipleChoiceTaskDataset"} +{"context": "What InContextLearningMultipleChoiceTaskDataset method prepares a set of multiple choice tasks into tokenized format with prompt and fewshot examples? Each question consists of a query and set of answer choices, only one of which is correct. At inference time we construct individual inference examples consisting of the query + a single choice, as well as an optional (prompt) and optional list of example query + correct answers, which precede the test query + choice.", "continuation": "prep_examples()"} +{"context": "What InContextLearningMultipleChoiceTaskDataset constructs batches for in-context learning schema evaluation?", "continuation": "InContextLearningSchemaTaskDataset"} +{"context": "What InContextLearningSchemaTaskDataset method prepares a set of schema questions into tokenized format with prompt and few shot examples. Each question consists of a set of possible contexts followed by a continuation, only one of the contexts would logically permit the continuation? At inference time we construct individual inference examples consisting of a single context option + the continuation, as well as an optional (prompt) and optional list of example correct context option + continuations, which precede the test context option + continuation. For schema, this method provides information relaying which of the answer choices is the correct one. This information is used for computing accuracy metrics.", "continuation": "prep_examples()"} +{"context": "What function partitions the dataset into a separate dataset for each category value in the data and write each partition to a local file in has_categories is enabled?", "continuation": "partition_dataset_by_category()"} +{"context": "What function constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks?", "continuation": "get_icl_task_dataloader()"} +{"context": "What function builds a dataloader for a generic language modeling dataset?", "continuation": "build_lm_dataloader()"} +{"context": "What function builds a MNIST dataloader?", "continuation": "build_mnist_dataloader()"} +{"context": "What function builds a syntheic MNIST dataset?", "continuation": "build_synthetic_mnist_dataloder()"} +{"context": "What object defines a class label type of the synthetic data?", "continuation": "SyntheticDataLabelType"} +{"context": "What Dataset emulates a dataset of provided size and shape?", "continuation": "SyntheticBatchPairDataset"} +{"context": "What Dataset yields samples of PIL.Image.Image and supports dataset transformations?", "continuation": "SyntheticPILDataset"} +{"context": "What object normalizes input data and removes the background class from target data if desired?", "continuation": "NormalizationFn"} +{"context": "What function constructs a length 2 tuple of torch.Tensors from datasets that yield samples of type PIL.Image.Image?", "continuation": "pil_image_collate()"} +{"context": "What function adds a transform to a dataset's collection of transforms?", "continuation": "add_vision_dataset_transform()"} +{"context": "What Device is an extension of 
~composer.devices.device.Device for CPUs?", "continuation": "DeviceCPU"} +{"context": "What Device is an extension of ~composer.devices.device.Device for GPUs?", "continuation": "DeviceGPU"} +{"context": "What Device supports MPS for training on Apple's M-series chips?", "continuation": "DeviceMPS"} +{"context": "What Device is an extension of ~composer.devices.device.Device for TPUs?", "continuation": "DeviceTPU"} +{"context": "What object is an abstract class for a device on which a model runs?", "continuation": "Device"} +{"context": "What Device method moves a module onto a device?", "continuation": "module_to_device()"} +{"context": "What Device method moves a tensor onto a device?", "continuation": "tensor_to_device()"} +{"context": "What Device method moves all tensor items in a batch to a device?", "continuation": "batch_to_device()"} +{"context": "What Device method moves the optimizer's state onto a device?", "continuation": "optimizer_to_device()"} +{"context": "What function recursively maps a function to all items in a batch?", "continuation": "_map_batch()"} +{"context": "What LoggerDestination logs metrics to the console?", "continuation": "ConsoleLogger"} +{"context": "What LoggerDestination logs metrics to a file?", "continuation": "FileLogger"} +{"context": "What FileLogger method writes to the logfile?", "continuation": "write()"} +{"context": "What LoggerDestination logs metrics to dictionary objects that persist in memory throughout training?", "continuation": "InMemoryLogger"} +{"context": "What InMemoryLogger method returns logged data as a dict containing values of a desired metric over time?", "continuation": "get_timeseries()"} +{"context": "What object is the base class for logger destination?", "continuation": "LoggerDestination"} +{"context": "What LoggerDestination method logs hyperparameters, configurations, and settings that don't vary during the run?", "continuation": "log_hyperparameters()"} +{"context": "What LoggerDestination method logs metrics or parameters that vary during the run?", "continuation": "log_metrics()"} +{"context": "What LoggerDestination method logs traces or any debug-related data like algorithm traces?", "continuation": "log_traces()"} +{"context": "What LoggerDestination method logs images or any tensor/arrays as images?", "continuation": "log_images()"} +{"context": "What LoggerDestination method handles uploading a file stored at file_path to a file named remote_file_name?", "continuation": "upload_file()"} +{"context": "What LoggerDestination method handles downloading a file stored at remote_file_name to destination?", "continuation": ""} +{"context": "What LoggerDestination method indicates whether LoggerDestination can upload files?", "continuation": "can_upload_files()"} +{"context": "What object is an interface to record training data?", "continuation": "Logger"} +{"context": "What Logger method logs images or any tensors/arrays as images?", "continuation": "log_images()"} +{"context": "What Logger method uploads file_path as a file named remote_file_name?", "continuation": "upload_file()"} +{"context": "What Logger method determines if the logger has a destination which supports uploading files?", "continuation": "has_file_upload_destination()"} +{"context": "What function recursively formats a given log data value into a string", "continuation": "format_log_data_value()"} +{"context": "What LoggerDestination uses MLFlow?", "continuation": "MLFlowLogger"} +{"context": "What LoggerDestination uses Comet?", "continuation": 
"CometMLLogger"} +{"context": "What LoggerDestination uses MosaicML?", "continuation": "MosaicMLLogger"} +{"context": "What LoggerDestination logs metrics to the console and optionally show a progress bar?", "continuation": "ProgressBarLogger"} +{"context": "What LoggerDestination uploads (downloads) files to (from) a remote backend?", "continuation": "RemoteUploaderDownloader"} +{"context": "What RemoteUploaderDownloader method returns the ObjectStore instance for the main thread?", "continuation": "remote_backend()"} +{"context": "What RemoteUploaderDownloader method checks in all workers are alive?", "continuation": "_all_workers_alive()"} +{"context": "What RemoteUploaderDownloader method checks whether the logger supports uploading files?", "continuation": "can_upload_files()"} +{"context": "What RemoteUploaderDownloader method enqueues objects from self._logged_objects onto self._file_upload_queue and keeps self._enqueued_objects in sync with self._file_upload_queue by listening to self._completed_uploads?", "continuation": "_enqueue_uploads()"} +{"context": "What RemoteUploaderDownloader method waits for all tasks to be completed?", "continuation": "wait_for_workers()"} +{"context": "What RemoteUploaderDownloader method gets the object store provider uri for a remote file?", "continuation": "get_uri_for_file()"} +{"context": "What RemoteUploaderDownloader method formats the remote_file_name according to the file_path_format_string?", "continuation": "_remote_file_name()"} +{"context": "What function handles uploading files to the object store?", "continuation": "_upload_worker()"} +{"context": "What LoggerDestination logs metrics to Slack?", "continuation": "SlackLogger"} +{"context": "What SlackLogger method flushes the buffer to Slack if the buffer size exceeds max_logs_per_message?", "continuation": "_log_to_budder()"} +{"context": "What SlackLogger method returns the default formatter function if no formatter func is specified?", "continuation": "_default_log_bold_key_normal_value_pair_with_header()"} +{"context": "What SlackLogger method flushes buffered metadata to MosaicML", "continuation": "_flush_logs_to_slack()"} +{"context": "What LoggerDestination logs metrics to Tensorboard?", "continuation": "TensorboardLogger"} +{"context": "What LoggerDestination logs metrics to WandB?", "continuation": "WandBLogger"} +{"context": "What function returns a replacement for F.binary_cross_entropy_with_logits that handles class indices or one-hot label?", "continuation": "binary_cross_entropy_with_logits()"} +{"context": "What function returns a drop-in replacement for F.cross_entropy that handles class indices or one-hot labels?", "continuation": "soft_cross_entropy()"} +{"context": "What Loss criterion computes the dice loss between input and target", "continuation": "DiceLoss"} +{"context": "What function infers whether the target is in indices formate or one_hot format?", "continuation": "infer_target_type()"} +{"context": "What function ensures that the targets are in a one-hot format rather than an index format?", "continuation": "ensure_targets_one_hot()"} +{"context": "What function converts a tensor of index class labels to a tensor of one-hot class labels?", "continuation": "_one_hot()"} +{"context": "What object is a Dataclass to wrap the final mAP results?", "continuation": "MAPMetricResults"} +{"context": "What object moves logs to log.debug()", "continuation": "WriteToLog"} +{"context": "What object suppresses the default output of the pycocotools package?", "continuation": 
"_hide_prints"} +{"context": "What function ensures the correct input format of preds and targets?", "continuation": "_input_validator()"} +{"context": "What Metric computes the Mean_Average-Precission (mAP) and Mean-Average-Recall (mAR) for object detection predictions?", "continuation": "MAP"} +{"context": "What MAP method adds detections and groundtruth to the metric?", "continuation": "update()"} +{"context": "What MAP method computes the Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) scores?", "continuation": "compute()"} +{"context": "What MAP method transforms and returns all cached targets or predictions in COCO format", "continuation": "_get_coco_format()"} +{"context": "What Metric calculates the intersection area between the predicted class mask and the label class mask? The intersection is then divided by the area of the union of the predicted and label masks. This measures the quality of predicted class mask with respect to the label. The IoU for each class is then averaged and the final result is the mIoU score. Implementation is primarily?", "continuation": "MIoU"} +{"context": "What MIoU method updates the state with new predictions and targets?", "continuation": "update()"} +{"context": "What MIoU method aggregates state across all processes and computes final metrix?", "continuation": "compute()"} +{"context": "What Metric measures how similar predictions and targets are using Dice Coefficient?", "continuation": "Dice"} +{"context": "What Dice method updates the state based on new predictions and targets?", "continuation": "update()"} +{"context": "What Dice method aggregates state across all processes and computes final metrix?", "continuation": "compute()"} +{"context": "What Metric implements cross entropy loss as a torchmetrics.Metric so that it can be returned by the ComposerModel.metrics?", "continuation": "CrossEntropy"} +{"context": "What CrossEntropy method updates the state with new predictions and targets?", "continuation": "update()"} +{"context": "What CrossEntropy method aggregates state across all processes and computes final metrix?", "continuation": "compute()"} +{"context": "What Metric turns a torch.nn Loss Module into distributed torchmetrics Metric?", "continuation": "LossMetric"} +{"context": "What LossMetric method updates the state with new predictions and targets?", "continuation": "update()"} +{"context": "What LossMetric method aggregates state across all processes and computes final metrix?", "continuation": "compute()"} +{"context": "What Metric computes accuracy with support for masked indices?", "continuation": "MaskedAccuracy"} +{"context": "What Metric computes cross entropy on language modeling outputs?", "continuation": "LanguageCrossEntropy"} +{"context": "What LossMetric method updates the internal state with results from a new batch?", "continuation": "update()"} +{"context": "What LossMetric method aggregates the state over all processes to compute the metric?", "continuation": "compute()"} +{"context": "What Metric implements F1 Scores for binary classification tasks via sklearn?", "continuation": "BinaryF1Score"} +{"context": "What BinaryF1Score method updates the internal state with results from a new batch?", "continuation": "update()"} +{"context": "What BinaryF1Score method aggregates the state over all processes to compute the metric?", "continuation": "compute()"} +{"context": "What LanguageCrossEntropy is a subclasses composer.metrics.nlp.LanguageCrossEntropy to implement perplexity?", "continuation": 
"LanguagePerplexity"} +{"context": "What InContextLearningMetric computes accuracy for In-context learning (ICL) question answering (QA) tasks?", "continuation": "InContextLearningQAAccuracy"} +{"context": "What InContextLearningMetric computes accuracy for In-context learning (ICL) language modeling (LM) tasks?", "continuation": "LanguaInContextLearningLMAccuracygePerplexity"} +{"context": "What InContextLearningMetric computes accuracy for In-context learning (ICL) multiple choice (MC) tasks?", "continuation": "InContextLearningMultipleChoiceAccuracy"} +{"context": "What InContextLearningMetric is a generic class for Expected Calibration Error (ECE)?", "continuation": "InContextLearningExpectedCalibrationError"} +{"context": "What InContextLearningExpectedCalibrationError computes Expected Calibration Error (ECE) for In-context learning (ICL) multiple choice (MC) tasks?", "continuation": "InContextLearningMCExpectedCalibrationError"} +{"context": "What InContextLearningExpectedCalibrationError computes Expected Calibration Error (ECE) for In-context learning (ICL) language modeling (LM) tasks?", "continuation": "InContextLearningLMExpectedCalibrationError"} +{"context": "What function creates BERT model based on hugging_face Transformers?", "continuation": "create_bert_mlm()"} +{"context": "What function creates BERT classification model based on hugging_face Transformers?", "continuation": "create_bert_classification()"} +{"context": "What is the name of the toy convolutional neural network archetecture in pytorch for MNIST?", "continuation": "Model"} +{"context": "What function creates a ComposerClassifier with a simple convolutional neural network", "continuation": "mnist_model()"} +{"context": "What function builds a mmsegmentation DeepLabV3 model", "continuation": "deeplabv3()"} +{"context": "What function creates a ComposerClassifier with a DeepLabv3(+) model and logs Mean Intersection over Union (MIoU) and Cross Entropy during training and validation?", "continuation": "composer_deeplabv3()"} +{"context": "What function rounds number of channels after scaling with width multiplier?", "continuation": "def round_channels()"} +{"context": "What function calculates the amount of padding to use to get the SAME functionality in Tensorflow?", "continuation": "round_channels()"} +{"context": "What function randomly masks a set of samples and provides similar regularization as stochastic depth?", "continuation": "drop_connect()"} +{"context": "What module is a squeeze excite layer?", "continuation": "SqueezeExcite"} +{"context": "What module is Depthwise Separable Convolution layer?", "continuation": "DepthwiseSeparableConv"} +{"context": "What module is Mobile Inverted Residual Bottleneck Block?", "continuation": "MBConvBlock"} +{"context": "What module is EfficientNet model?", "continuation": "EfficientNet"} +{"context": "What EfficientNet method instantiate an EfficientNet model family member based on the model_name string?", "continuation": "get_model_from_name()"} +{"context": "What EfficientNet method decodes an EfficientNet block specification string into a dictionary of keyword arguments for a block in the architecture?", "continuation": "_decode_block_string()"} +{"context": "What function creates a ComposerClassifier object with an EfficientNet-b0 architecture?", "continuation": "composer_efficientnetb0()"} +{"context": "What function implements composer.models.huggingface.HuggingFaceModel to wrap Hugging Face GPT-2 transformers and logs training and validation perplexity?", 
"continuation": "create_gpt2()"} +{"context": "What function creates a ComposerClassifier object with a torchvision ResNet model?", "continuation": "composer_resnet()"} +{"context": "What function creates a ComposerClassifier object with a CIFAR ResNet models?", "continuation": "composer_resnet_cifar()"} +{"context": "What module is a residual neural network as originally designed for CIFAR-10?", "continuation": "ResNetCIFAR"} +{"context": "What module is a 9-layer residual network, excluding BatchNorms and activation functions?", "continuation": "ResNet9"} +{"context": "What ComposerModel is a convenience class that creates a ComposerModel for classification tasks from a vanilla PyTorch model? ComposerClassifier requires batches in the form: (``input``, ``target``) and includes a basic classification training loop with a loss function loss_fn which takes in the model's outputs and the labels.", "continuation": "ComposerClassifier"} +{"context": "What function creates a wrapper around timm.create_model()?", "continuation": "composer_timm()"} +{"context": "What function creates ComposerClassifier object using a ViT-S/16 model", "continuation": ""} +{"context": "What module is the interface needed to make a PyTorch model compatible with composer.Trainer", "continuation": "ComposerModel"} +{"context": "What ComposerModel method computes model output given a batch from the dataloader?", "continuation": "forward()"} +{"context": "What ComposerModel method compute the loss of the model given outputs from method forward() and a composer.core.types.Batch of data from the dataloader? The Trainer will call .backward() on the returned loss.", "continuation": "loss()"} +{"context": "What ComposerModel method run the evaluation forward pass?", "continuation": "eval_forward()"} +{"context": "What ComposerModel method gets the metrics?", "continuation": "get_metrics()"} +{"context": "What ComposerModel object acts as a wrapper class that converts HuggingFace transformers models to composer models?", "continuation": "HuggingFaceModel"} +{"context": "What HuggingFaceModel method loads a HuggingFace tokenizer from a loaded in hf state?", "continuation": "load_huggingface_tokenizer_from_saved_state()"} +{"context": "What HuggingFaceModel method loads a HuggingFace model class from a loaded in hf state?", "continuation": "load_huggingface_model_from_saved_state()"} +{"context": "What HuggingFaceModel method loads a HuggingFace model (and tokenizer if present) from a composer checkpoint?", "continuation": "hf_from_composer_checkpoint()"} +{"context": "What HuggingFaceModel method generates from the underlying HuggingFace model?", "continuation": "generate()"} +{"context": "What function returns True if model class is either a registered HuggingFace Causal LM or a subclass of one?", "continuation": "_is_registered_causal_lm()"} +{"context": "What function gets a HuggingFace config from a composer state dict with overrides applied?", "continuation": "get_hf_config_from_composer_state_dict()"} +{"context": "What function writes a config.json and pytorch_model.bin, like method transformers.PreTrainedModel.from_pretrained expects, from a composer checkpoint?", "continuation": "write_huggingface_pretrained_from_composer_checkpoint()"} +{"context": "What object sets the initialization scheme for different layers of a PyTorch model?", "continuation": "Initializer"} +{"context": "What ComposerModel object acts as a wrapper class that adapts mmdetection detectors to composer models?", "continuation": "MMDetModel"} 
+{"context": "What SGD object implements a SGD optimizer with the weight decay term decoupled from the learning rate?", "continuation": "DecoupledSGDW"} +{"context": "What DecoupledSGDW method performs SGDW algorithm computation?", "continuation": "sgdw()"} +{"context": "What DecoupledSGDW method performs a single optimization step?", "continuation": "step()"} +{"context": "What AdamW object implements a Adam optimizer with the weight decay term decoupled from the learning rate?", "continuation": "DecoupledAdamW"} +{"context": "What DecoupledAdamW method performs adamw algorithm computation?", "continuation": "adamw()"} +{"context": "What DecoupledAdamW method performs a single optimization step?", "continuation": "step()"} +{"context": "What Protocol object is a specification for a stateless scheduler function?", "continuation": "ComposerScheduler"} +{"context": "What ComposerScheduler object decays the learning rate discretely at fixed intervals?", "continuation": "StepScheduler"} +{"context": "What ComposerScheduler object decays the learning rate discretely at fixed milestones?", "continuation": "MultiStepScheduler"} +{"context": "What ComposerScheduler object maintains a fixed learning rate?", "continuation": "ConstantScheduler"} +{"context": "What ComposerScheduler object adjusts the learning rate linearly?", "continuation": "LinearScheduler"} +{"context": "What ComposerScheduler object decays the learning rate exponentially?", "continuation": "ExponentialScheduler"} +{"context": "What ComposerScheduler object decays the learning rate according to the decreasing part of a cosine curve?", "continuation": "CosineAnnealingScheduler"} +{"context": "What ComposerScheduler object cyclically decays the learning rate according to the decreasing part of a cosine curve?", "continuation": "CosineAnnealingWarmRestartsScheduler"} +{"context": "What ComposerScheduler object sets the learning rate to be proportional to a power of the fraction of training time left?", "continuation": "PolynomialScheduler"} +{"context": "What ComposerScheduler object decays the learning rate discretely at fixed milestones, with an initial warmup?", "continuation": "MultiStepWithWarmupScheduler"} +{"context": "What ComposerScheduler object maintains a fixed learning rate, with an initial warmup?", "continuation": "ConstantWithWarmupScheduler"} +{"context": "What ComposerScheduler object adjusts the learning rate linearly, with an initial warmup?", "continuation": "LinearWithWarmupScheduler"} +{"context": "What ComposerScheduler object decays the learning rate according to the decreasing part of a cosine curve, with an initial warmup?", "continuation": "CosineAnnealingWithWarmupScheduler"} +{"context": "What ComposerScheduler object decays the learning rate according to a power of the fraction of training time left, with an initial warmup?", "continuation": "PolynomialWithWarmupScheduler"} +{"context": "What TraceHandler object records trace events in Chrome JSON trace format?", "continuation": "JSONTraceHandler"} +{"context": "What TraceHandler method helps record an event in the trace?", "continuation": "_record_event()"} +{"context": "What function merges profiler output JSON trace files together", "continuation": "merge_traces()"} +{"context": "What object acts as a profiler marker?", "continuation": "Marker"} +{"context": "What Marker method records the start of a duration event?", "continuation": "start()"} +{"context": "What Marker method records the end of a duration event?", "continuation": "finish()"} 
+{"context": "What Marker method records an instant event?", "continuation": "instant()"} +{"context": "What Marker method records an counter event?", "continuation": "counter()"} +{"context": "What object defines whether or not events are being recorded to the trace file?", "continuation": "ProfilerAction"} +{"context": "What function returns a profiler schedule function for a cyclic profiling window?", "continuation": "cyclic_schedule"} +{"context": "What object acts as a composer profiler?", "continuation": "Profiler"} +{"context": "What Profiler method binds the profiler to the state?", "continuation": "bind_to_state()"} +{"context": "What Profiler records trace events in Chrome JSON format in the trace handlers?", "continuation": "record_chrome_json_trace_file"} +{"context": "What Profiler method creates and gets an instance of a Marker object?", "continuation": "marker()"} +{"context": "What Callback object records system level metrics?", "continuation": "SystemProfiler"} +{"context": "What Callback object profiles the execution using the PyTorch Profiler?", "continuation": "TorchProfiler"} +{"context": "What Callback object is the base class for Composer Profiler trace handlers?", "continuation": "TraceHandler"} +{"context": "What TraceHandler method invokes whenever there is a duration event to record?", "continuation": "process_duration_event()"} +{"context": "What TraceHandler method invokes whenever there is an instant event to record?", "continuation": "process_instant_event()"} +{"context": "What TraceHandler method invokes whenever there is an counter event to record?", "continuation": "process_counter_event()"} +{"context": "What TraceHandler method invokes when there are events in Chrome JSON format to record?", "continuation": "process_chrome_json_trace_file()"} +{"context": "What function parses the provided DeepSpeed config for compatibility with the Mosaic trainer?", "continuation": "_parse_deepspeed_config()"} +{"context": "What function ensures that a batch is properly formatted for DeepSpeed precisions, if active?", "continuation": "_fix_batch_precision_for_deepspeed()"} +{"context": "What function makes a learning rate schedule take a different number of epochs?", "continuation": "scale_pytorch_scheduler()"} +{"context": "What Gradscaler object allows for gradient scaling during with closures?", "continuation": "ClosureGradScaler"} +{"context": "What ClosureGradScaler method performs a step on the optimizer with amp?", "continuation": "step()"} +{"context": "What ClosureGradScaler method updates the scale factor?", "continuation": "update()"} +{"context": "What object do we use to perform gradient synchronization?", "continuation": "DDPSyncStrategy"} +{"context": "What DDPSyncStrategy method acts as a context manager for handling the DDPSyncStrategy?", "continuation": "ddp_sync_context()"} +{"context": "What DDPSyncStrategy method wraps the module in a torch.nn.parallel.DistributedDataParallel object if running distributed training?", "continuation": "prepare_ddp_module()"} +{"context": "What DDPSyncStrategy method helps recreate optimizer groups for FSDP wrapped modules?", "continuation": "_recreate_fsdp_param_groups_from_unwrapped_opt_info()"} +{"context": "What DDPSyncStrategy method prepares a module and optimizer for use with torch.distributed.fsdp.FullyShardedDataParallel?", "continuation": "prepare_fsdp_module()"} +{"context": "What function applies the function recursively to a module's children and the module itself?", "continuation": "meta_safe_apply()"} 
+{"context": "What function concatenates a list of strings together with a delimiter in between the strings in the list", "continuation": "concatenate_strings()"} +{"context": "What function configures cpu_offload?", "continuation": "get_cpu_offload()"} +{"context": "What function configures and/or retrieving process groups?", "continuation": "get_process_group()"} +{"context": "What function updates FSDPs _recursive_wrap to enable module_kwargs and custom process_group cache?", "continuation": "_custom_recursive_wrap()"} +{"context": "What FullyShardedDataParallel object updates _auto_wrap to enable module_kwargs?", "continuation": "MosaicFullyShardedDataParallel"} +{"context": "What function sets initial value of device_train_microbatch_size?", "continuation": "_get_initial_device_train_microbatch_size()"} +{"context": "What function determines if error is CUDA Out of Memory and if auto_microbatching is enabled?", "continuation": "_is_cuda_oom()"} +{"context": "What function adjusts device_train_microbatch_size if we encounter OOM?", "continuation": "_adjust_device_train_microbatch_size()"} +{"context": "What object trains models with Composer algorithms?", "continuation": "Trainer"} +{"context": "What Trainer method returns list of saved checkpoints?", "continuation": "saved_checkpoints()"} +{"context": "What Trainer method atempts to download the checkpoint from the logger destinations?", "continuation": "_try_checkpoint_download()"} +{"context": "What Trainer method determines the load path when using autoresume?", "continuation": "_get_autoresume_checkpoint()"} +{"context": "What Trainer method trains the model?", "continuation": "fit()"} +{"context": "What Trainer method shuts down trainer?", "continuation": "close()"} +{"context": "What Trainer method computes metrics, logs the results, and updates the state with the deep-copied metrics?", "continuation": "_compute_and_log_metrics()"} +{"context": "What Trainer method spins the dataloaders to restore sampler state for current epoch?", "continuation": "_spin_dataloaders_to_cur_epoch()"} +{"context": "What Trainer method accumulates the number of samples and tokens across ranks?", "continuation": "_accumulate_time_across_ranks()"} +{"context": "What Trainer method runs training for the specified number of epochs and log results?", "continuation": "_train_loop()"} +{"context": "What Trainer method runs evaluators periodically during training?", "continuation": "_run_evaluators()"} +{"context": "What Trainer method computes loss by training on a full batch of data?", "continuation": "_train_batch()"} +{"context": "What Trainer method iterates over microbatches and compute the loss that will be used to step the optimizer?", "continuation": "_train_microbatches()"} +{"context": "What Trainer method trains and computes the loss of state.batch, which is assumed to be a single microbatch?", "continuation": "_train_microbatch()"} +{"context": "What Trainer method outputs model prediction on the provided data?", "continuation": "predict()"} +{"context": "What Trainer method runs evaluation loop?", "continuation": "eval()"} +{"context": "What Trainer method evaluates the model and log appropriate metrics?", "continuation": "_eval_loop()"} +{"context": "What Trainer method determines based on precision when to use grad scaling?", "continuation": "_use_grad_scaling()"} +{"context": "What Trainer method iterates over the dataloader?", "continuation": "_iter_dataloader()"} +{"context": "What Trainer method determines based on precision and 
optimizers whether to use closures?", "continuation": "_use_closures()"} +{"context": "What Trainer method checkpoints the training State?", "continuation": "save_checkpoint()"} +{"context": "What Trainer method checkpoints the training State using a CheckpointSaver if it exists?", "continuation": "save_checkpoint_to_save_folder()"} +{"context": "What Trainer method exports a model for inference?", "continuation": "export_for_inference()"} +{"context": "What ObjectStore object acts as utility for uploading to and downloading from object (blob) stores, such as Amazon S3?", "continuation": "LibcloudObjectStore"} +{"context": "What RuntimeError object is a custom exception class to signify transient errors?", "continuation": "ObjectStoreTransientError"} +{"context": "What object is an abstract class for implementing object stores, such as LibcloudObjectStore and S3ObjectStore?", "continuation": "ObjectStore"} +{"context": "What ObjectStore method returns the URI for object_name?", "continuation": "get_uri()"} +{"context": "What ObjectStore method uploads an object currently located on a disk", "continuation": "upload_object()"} +{"context": "What ObjectStore method gets the size of an object, in bytes?", "continuation": "get_object_size()"} +{"context": "What ObjectStore method downloads an object to the specified destination path?", "continuation": "download_object()"} +{"context": "What ObjectStore method closes the object store?", "continuation": "close()"} +{"context": "What ObjectStore object acts as utility for uploading to and downloading from an OCI bucket?", "continuation": "OCIObjectStore()"} +{"context": "What ObjectStore object acts as utility for uploading to and downloading from an S3-compatible bucket using boto3?", "continuation": "S3ObjectStore()"} +{"context": "What ObjectStore object acts as utility for uploading to and downloading to a server via SFTP?", "continuation": "SFTPObjectStore()"} +{"context": "What function takes in local symbol table and recursively grabs any hyperparameter?", "continuation": "extract_hparams()"} +{"context": "What function parses objects for their hyperparameters going only one level deep?", "continuation": "_grab_hparams()"} +{"context": "What function returns best representation of object", "continuation": "_get_obj_repr()"} +{"context": "What function takes in a nested dict converts it to a flat dict with keys separated by slashes?", "continuation": "convert_nested_dict_to_flat_dict()"} +{"context": "What function converts flat dictionary separated by slashes to nested dictionary?", "continuation": "convert_flat_dict_to_nested_dict()"} +{"context": "What function indexes into the batch given the key?", "continuation": "batch_get()"} +{"context": "What function indexes into the batch given the key and sets the element at that index to value?", "continuation": "batch_set()"} +{"context": "What function sets a key value pair in a non-tuple batch?", "continuation": "_batch_set()"} +{"context": "What function sets multiple key value pairs in a non-tuple batch?", "continuation": "_batch_set_multiple()"} +{"context": "What function sets key value pairs in tuples and NamedTuples?", "continuation": "_batch_set_tuple()"} +{"context": "What function formats path with the rank zero values?", "continuation": "_format_path_with_rank_zero()"} +{"context": "What function formats path formatted with the current rank values?", "continuation": "_format_path_with_current_rank()"} +{"context": "What function gets the write mode to use with tarfile.open 
function?", "continuation": "_get_write_mode()"} +{"context": "What function loads a checkpoint from a local file, URI, or cloud object store into ``state``?", "continuation": "load_checkpoint()"} +{"context": "What function broadcasts the path from the LOCAL rank zero to all LOCAL ranks?", "continuation": "_get_local_rank_zero_path()"} +{"context": "What function downloads the checkpoint stored at path, potentially in object_store, to`node_checkpoint_folder?", "continuation": "download_checkpoint()"} +{"context": "What function provides a function which deletes all subparts of a dictionary based on a list of paths?", "continuation": "glob_filter()"} +{"context": "What function loads a torch checkpoint, catching errors due to backwards compatibility issues?", "continuation": "safe_torch_load()"} +{"context": "What function restores a checkpoint into state and returns the rng state dicts (if load_weights_only is False)?", "continuation": "_restore_checkpoint()"} +{"context": "What function replaces a file with its compressed version?", "continuation": "_compress_file()"} +{"context": "What function saves Deepspeed model and tarball the files?", "continuation": "_save_deepspeed_model()"} +{"context": "What function produces exception report (exception message + environment report)?", "continuation": "_exc_report()"} +{"context": "What function enables environment report generation on exception?", "continuation": "enable_env_report()"} +{"context": "What function disables environment report generation on exception?", "continuation": "disable_env_report()"} +{"context": "What function acts as a custom exception wrapper for sys.excepthook?", "continuation": "_custom_exception_handler()"} +{"context": "What function acts as a custom exception handler for IPython?", "continuation": "_nb_custom_exception_handler()"} +{"context": "What function collects and prints system information when the sys.excepthook function is called?", "continuation": "configure_excepthook()"} +{"context": "What function queries Torch system environment via torch.utils.collect_env?", "continuation": "get_torch_env()"} +{"context": "What function queries Composer pertinent system information as a dict?", "continuation": "get_composer_env_dict()"} +{"context": "What function queries Composer pertinent system information?", "continuation": "get_composer_env()"} +{"context": "What function generates system information report?", "continuation": "print_env()"} +{"context": "What function takes string or Device and returns the corresponding composer.devices.Device?", "continuation": "get_device()"} +{"context": "What function determines whether the module needed for training on TPUs—torch_xla—is installed?", "continuation": "is_tpu_installed()"} +{"context": "What function returns the world size, which is the number of processes participating in this training run?", "continuation": "get_world_size()"} +{"context": "What function returns the global rank of the current process?", "continuation": "get_global_rank()"} +{"context": "What function returns the local world size, which is the number of processes for the current node?", "continuation": "get_local_world_size()"} +{"context": "What function returns the local rank for the current process?", "continuation": "get_local_rank()"} +{"context": "What function returns the node rank?", "continuation": "get_node_rank()"} +{"context": "What function synchronizes all processes?", "continuation": "barrier()"} +{"context": "What function reduces a tensor by applying the 
reduce_operation?", "continuation": "all_reduce()"} +{"context": "What function broadcasts the tensor to the whole group?", "continuation": "broadcast()"} +{"context": "What function broadcasts picklable objects in object_list to the whole group?", "continuation": "broadcast_object_list()"} +{"context": "What function collects a torch.Tensor from each rank?", "continuation": "all_gather()"} +{"context": "What function collects a pickleable object from each rank and return a list of these objects indexed by rank?", "continuation": "all_gather_object()"} +{"context": "What function returns whether PyTorch was built with distributed support?", "continuation": "is_available()"} +{"context": "What function returns whether PyTorch distributed is initialized?", "continuation": "is_initialized()"} +{"context": "What function initializes the default PyTorch distributed process group?", "continuation": "initialize_dist()"} +{"context": "What function constructs a torch.utils.data.distributed.DistributedSampler for a dataset?", "continuation": "get_sampler()"} +{"context": "What function acts as a context manager to wait for a file to exist on all ranks except local rank zero?", "continuation": "local_rank_zero_download_and_wait()"} +{"context": "What function acts as a context manager to hold all non-zero ranks until rank zero completes?", "continuation": "run_local_rank_zero_first()"} +{"context": "What function returns a dict of distributed settings?", "continuation": "_get_dist_config()"} +{"context": "What function returns whether name has a tar-like extension?", "continuation": "is_tar()"} +{"context": "What function ensure that the given folder is empty?", "continuation": "ensure_folder_is_empty()"} +{"context": "What function ensure that the given folder does not have any files conflicting with the filename format string?", "continuation": "ensure_folder_has_no_conflicting_files()"} +{"context": "What function automatically creates an composer.utils.ObjectStore from supported URI formats?", "continuation": "maybe_create_object_store_from_uri()"} +{"context": "What function automatically creates a composer.loggers.RemoteUploaderDownloader from supported URI formats?", "continuation": "maybe_create_remote_uploader_downloader_from_uri()"} +{"context": "What function gets a file from a local folder, URL, or object store?", "continuation": "get_file()"} +{"context": "What function create a symlink file, which can be followed by get_file?", "continuation": "create_symlink_file()"} +{"context": "What function counts the number of instances of op in gm?", "continuation": "count_op_instances()"} +{"context": "What function replaces a single operator, torch method or function with another?", "continuation": "replace_op()"} +{"context": "What function walks backwards from nodeLHS and nodeRSH to the root and construct lists of their parents?", "continuation": "_get_residual_block_nodes()"} +{"context": "What function attaches tag to the given nodes for the splitter?", "continuation": "_attach_tag()"} +{"context": "What function tags nodes for splitting?", "continuation": "_tag_residual_nodes()"} +{"context": "What function returns GraphModules for the main and residual branches?", "continuation": "_get_residual_modules()"} +{"context": "What function replaces main, residual and add_node with the replacement_module?", "continuation": "_replace_residual_pattern()"} +{"context": "What function detects and replaces residual pattern with their stochastic equivalent?", "continuation": 
"apply_stochastic_residual()"} +{"context": "What function checks if all the linears have bias?", "continuation": "_can_linears_be_fused()"} +{"context": "What function checks if the linears can be fused?", "continuation": "_create_fused_linear()"} +{"context": "What function checks if there are parallel linears in the model and if so fuses them together?", "continuation": "fuse_parallel_linears()"} +{"context": "What ImportError handles errors for external packages that might not be installed?", "continuation": "MissingConditionalImportError"} +{"context": "What function dynamically imports a Python object?", "continuation": "import_object()"} +{"context": "What object contains supported export formats?", "continuation": "ExportFormat"} +{"context": "What function handles moving sample_input of various types to a device. If possible, avoids creating copies of the input?", "continuation": "_move_sample_input_to_device()"} +{"context": "What function exports a model for inference?", "continuation": "export_for_inference()"} +{"context": "What function helps export a model for inference?", "continuation": "export_with_logger()"} +{"context": "What function applies map_fn on each element in collection?", "continuation": "map_collection()"} +{"context": "What function converts input x into a tuple?", "continuation": "ensure_tuple()"} +{"context": "What class converts iterator of bytes into a file-like binary stream object?", "continuation": "IteratorFileStream"} +{"context": "What function invokes callback after each chunk is yielded from iterator?", "continuation": "iterate_with_callback()"} +{"context": "What function returns whether input model is an instance of a deepspeed.DeepSpeedEngine?", "continuation": "is_model_deepspeed()"} +{"context": "What function returns whether input model is an instance of a DistributedDataParallel?", "continuation": "is_model_ddp()"} +{"context": "What function returns whether input model is an instance of a FullyShardedDataParallel?", "continuation": "is_model_fsdp()"} +{"context": "What function returns whether Composer is running in a IPython/Jupyter Notebook?", "continuation": "is_notebook()"} +{"context": "What function forces Python warnings to consolidate into one line?", "continuation": "warning_on_one_line()"} +{"context": "What function gets free socket port to use as MASTER_PORT?", "continuation": "get_free_tcp_port()"} +{"context": "What funtion sets model.eval() for context duration, restoring model status at end?", "continuation": "model_eval_mode()"} +{"context": "What function checks the PyTorch version and compared it with version 2.0.0", "continuation": "using_torch_2()"} +{"context": "What function modifies model in-place by recursively applying replacement policies?", "continuation": "replace_module_classes()"} +{"context": "What function attempts to infer a module's device by inspecting its parameters and buffers?", "continuation": "_infer_device()"} +{"context": "What function counts the number of instances of module_class in module, recursively?", "continuation": "count_module_instances()"} +{"context": "What function counts instances of module_class in module, recursively, using a set to deduplicate?", "continuation": "_recur_count_module_instances()"} +{"context": "What function returns whether tensor is element for any element in iterable?", "continuation": "_tensor_in()"} +{"context": "What function returns the index of the optimizer param_group containing param?", "continuation": "_find_param_in_optimizer()"} +{"context": "What 
function returns first - second while maintaining the order in first?", "continuation": "_ordered_diff()"} +{"context": "What function removes old_params from the optimizers and insert new_params?", "continuation": "update_params_in_optimizer()"} +{"context": "What function configures PyTorch deterministic mode?", "continuation": "configure_deterministic_mode()"} +{"context": "What function gets a randomly created seed to use for seeding rng objects?", "continuation": "get_random_seed()"} +{"context": "What function seeds all rng objects?", "continuation": "seed_all()"} +{"context": "What function returns the state of the RNG objects?", "continuation": "get_rng_state()"} +{"context": "What function restores the RNG state?", "continuation": "load_rng_state()"} +{"context": "What function is the decorator to retry a function with backoff and jitter?", "continuation": "retry()"} \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/mcli_yamls/conversion/convert_txt_to_stream.yaml b/examples/end-to-end-examples/support_chatbot/mcli_yamls/conversion/convert_txt_to_stream.yaml new file mode 100644 index 000000000..bc090580f --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/mcli_yamls/conversion/convert_txt_to_stream.yaml @@ -0,0 +1,28 @@ +name: convert-txt-to-stream + +compute: + gpus: 8 # Number of GPUs to use + + ## These configurations are optional + # cluster: r0z0 # Name of the cluster to use for this run + # gpu_type: a100_80gb # Type of GPU to use. + +integrations: +# Clone the examples repository so that we have access to the code in sec_10k_qa +- integration_type: git_repo + git_repo: YOUR_GITHUB_USERNAME/examples + #git_branch: support-bot + ssh_clone: false # Should be true if using a private repo + path: /workspace/examples # Tell MCLI what path to clone the repo to + +# cd into the chatbot folder +# Install the necessary dependencies +# Run the script to process the raw data files and upload them to the cloud in the correct format +command: | + cd /workspace/examples/examples/end-to-end-examples/support_chatbot/ + pip install -r requirements.txt + python scripts/conversion/convert_txt_to_stream.py \ + --out_root CLOUD://BUCKET/support-bot-demo/data/composer_30b/ \ + --in_root retrieval_data/composer + +image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 # Use the Docker image provided by MosaicML \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/mcli_yamls/convert_checkpoint_to_huggingface.yaml b/examples/end-to-end-examples/support_chatbot/mcli_yamls/convert_checkpoint_to_huggingface.yaml new file mode 100644 index 000000000..e1ac55665 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/mcli_yamls/convert_checkpoint_to_huggingface.yaml @@ -0,0 +1,29 @@ +name: convert_to_hf + +scheduling: + priority: medium + +compute: + gpus: 0 # Number of GPUs to use + + ## These configurations are optional + # cluster: r0z0 # Name of the cluster to use for this run + # gpu_type: a100_80gb # Type of GPU to use. + +integrations: +# Clone and install the llm-foundry repo so we can run scripts from it +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + pip_install: -e . 
+ ssh_clone: false # Should be true if using a private repo + git_commit: 68448b2764cf6988c830e4d55796e6e28cdac20e + +# cd into the scripts/inference folder and run the MPT conversion script from LLM-foundry +command: | + cd llm-foundry/scripts/inference + python convert_composer_to_hf.py \ + --composer_path CLOUD://BUCKET_NAME/support-bot-demo/checkpoints/CHECKPOINT_FOLDER_NAME/latest-rank0.pt.symlink \ + --hf_output_path s3://BUCKET_NAME/support-bot-demo/converted_checkpoints/HF_FOLDER_NAME/ \ + --output_precision bf16 \ + +image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 # Use the Docker image provided by MosaicML \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/mcli_yamls/deploy_llm.yaml b/examples/end-to-end-examples/support_chatbot/mcli_yamls/deploy_llm.yaml new file mode 100644 index 000000000..8a9b1d1ee --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/mcli_yamls/deploy_llm.yaml @@ -0,0 +1,19 @@ +name: mpt-30b-composer-finetuned +compute: + gpus: 4 + instance: oci.bm.gpu.a10.4 +image: mosaicml/inference:0.1.29 +replicas: 1 +command: | + export PYTHONPATH=$PYTHONPATH:/code/examples:/code +integrations: +- integration_type: git_repo + git_repo: mosaicml/examples + ssh_clone: false + git_commit: df65ce9448f2e4c7803f7082930f80c8dc4e8fe1 +model: + download_parameters: + s3_path: s3://BUCKET_NAME/support-bot-demo/converted_checkpoints/HF_FOLDER_NAME/ + model_handler: examples.inference-deployments.mpt.mpt_handler.MPTModelHandler + model_parameters: + model_name: mosaicml/mpt-30b-chat diff --git a/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_30b_chat.yaml b/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_30b_chat.yaml new file mode 100644 index 000000000..a56b1d77d --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_30b_chat.yaml @@ -0,0 +1,201 @@ +name: mpt-30b-chat_composer_chatv2 + +compute: + gpus: 8 # Number of GPUs to use + + ## These configurations are optional + # cluster: r0z0 # Name of the cluster to use for this run + # gpu_type: h100_80gb # Type of GPU to use. + +integrations: +# Clone and install the llm-foundry repo so we can run scripts from it +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + pip_install: -e .[gpu] + ssh_clone: false # Should be true if using a private repo + git_commit: 68448b2764cf6988c830e4d55796e6e28cdac20e + +# Uncomment and fill in to log to WandB. 
Also uncomment the loggers section near the bottom of the yaml +# - integration_type: wandb +# entity: mosaic-ml +# project: support-bot-demo-composer-chatv2 + +# cd into the llm-foundry/scripts directory and run the train.py script +command: | + cd llm-foundry/scripts + composer train/train.py /mnt/config/parameters.yaml || (echo "Command failed - killing python" && pkill python && exit 1) + +image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 # Use the Docker image provided by MosaicML + +# The parameters section is mounted to /mnt/config/parameters.yaml in the container +# The mounted file is then passed directly to the train.py script +# See LLM-foundry llmfoundry/scripts/train.py to see how the parameters are used in code +parameters: + # Path to load the weights from the previous step + load_path: CLOUD://BUCKET_NAME/support-bot-demo/checkpoints/mpt-30b-chat_composer-codebase/latest-rank0.pt.symlink + load_weights_only: true # Only load the weights for finetuning, discarding any other state from previous training + + # Checkpoint to local filesystem or remote object store + save_interval: 1ep # How frequently to save checkpoints + save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK + save_folder: CLOUD://BUCKET_NAME/support-bot-demo/checkpoints/mpt-30b-chat_composer_chatv2/ + save_weights_only: true # Since we only need the weights for the next step, we can reduce the size of the checkpoint + + # Maximum sequence length of the model + # For MPT, you can change this to a different number if you would like to train on longer sequences + # Note that you would also need to reprocess your data to contain longer sequences + max_seq_len: 8192 + + # Random seed to ensure reproducibility + global_seed: 17 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + + # Model + # This section is used by LLM-foundry to construct the model + model: + name: hf_causal_lm + pretrained: false + pretrained_model_name_or_path: mosaicml/mpt-30b + init_device: mixed + config_overrides: + max_seq_len: ${max_seq_len} + attn_config: + attn_impl: triton + attn_uses_sequence_id: false + + train_loader: + name: finetuning + dataset: + hf_name: sam-mosaic/chat-v2 + split: train + max_seq_len: ${max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # to profile this run's optimal packing_ratio as it depends on GPU count, + # batch size, sequence length. Turning on packing by setting packing_ratio + # here will pack multiple examples into one sequence for increased efficiency. + # For the mosaicml/dolly_hhrlhf and max sequence length 2048, + # 3 is a good packing_ratio that will not cause any examples to be trimmed. + # As an approximate rule of thumb, if you, for example, double max_seq_len you can double packing_ratio. 
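+      # For illustration only (an extrapolation of the rule of thumb above, not a profiled value):
+      # this yaml uses max_seq_len 8192, i.e. 4x 2048, so a starting packing_ratio of roughly
+      # 12 (4 x 3) may be reasonable; confirm with the packing.py profiling command above
+      # before enabling it.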
+ # packing_ratio: + shuffle: true + drop_last: true + num_workers: 8 + + eval_loader: + name: finetuning + dataset: + hf_name: sam-mosaic/chat-v2 + split: test + max_seq_len: ${max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + # packing_ratio: + shuffle: true + drop_last: false + num_workers: 8 + + # Learning rate scheduler + # see LLM-foundry llmfoundry/utils/builders.py::build_scheduler for other built-in options + scheduler: + name: linear_decay_with_warmup + t_warmup: 50ba + alpha_f: 0 + + # Optimizer + # see LLM-foundry llmfoundry/utils/builders.py::build_optimizer for other built-in options + optimizer: + name: decoupled_lionw + lr: 0.0000005 + betas: + - 0.9 + - 0.99 + weight_decay: 0 + + + # Algorithms to apply + # see LLM-foundry llmfoundry/utils/builders.py::build_algorithm for other built-in options + algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + + # Run configuration + max_duration: 1ep # Maximum duration of the run. Change to something shorter (e.g. 10ba) for a quick test run + eval_interval: 2000ba # How frequently to evaluate the model + eval_first: true # Whether to evaluate the model before training + eval_subset_num_batches: -1 # How many batches to evaluate on. -1 means evaluate on the entire dataset + global_train_batch_size: 64 # Global batch size. This is the batch size across all GPUs and should be 8*num_gpus + seed: ${global_seed} + device_eval_batch_size: 2 # Evaluation batch size per GPU + device_train_microbatch_size: 1 + precision: amp_bf16 + + # Configuration settings for FSDP + # https://docs.mosaicml.com/projects/composer/en/latest/notes/distributed_training.html#fullyshardeddataparallel-fsdp + # for more information about FSDP in Composer + fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + verbose: false + + # Logging configuration + progress_bar: false + log_to_console: true + console_log_interval: 1ba + python_log_level: debug + + # Uncomment to log to WandB + # see LLM-foundry llmfoundry/utils/builders.py::build_logger for other built-in options + # loggers: + # wandb: {} + + # Callbacks + # see LLM-foundry llmfoundry/utils/builders.py::build_callback for other built-in options + callbacks: + # Periodically generate text from the model, uncomment only if you are logging to WandB + # generate_callback: + # batch_log_interval: 500 + # do_sample: true + # max_new_tokens: 100 + # prompts: + # - The quick brown fox jumps over + # - |- + # Vegan Banana Bread + # Instructions: + # 1. + # - The other day I was explaining what generative AI is to my five year old. + # - We are a global semiconductor company primarily offering + # - Our company had revenue of + # - Our business operations are subject to numerous risks, including + # - What was AMD's revenue in 2019? + # - What is net operating income? 
+ # temperature: 1 + # top_k: 50 + # top_p: 0.95 + # use_cache: true + # Log information about the processing speed of the model + speed_monitor: + window_size: 10 + # Log the learning rate over the course of training + lr_monitor: {} + # Log information about the memory usage of the model + memory_monitor: {} + # Log an estimate of how long the training run has left to complete + runtime_estimator: {} + + icl_max_seq_len: 2048 + icl_tasks: + - + label: arc_easy + dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: 'Answer: ' \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_composer_codebase.yaml b/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_composer_codebase.yaml new file mode 100644 index 000000000..c63707e57 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/mcli_yamls/finetune/finetune_composer_codebase.yaml @@ -0,0 +1,204 @@ +name: mpt-30b_chat-composer + +compute: + gpus: 8 # Number of GPUs to use + + ## These configurations are optional + # cluster: r0z0 # Name of the cluster to use for this run + #gpu_type: h100_80gb # Type of GPU to use. + +integrations: +# Clone and install the llm-foundry repo so we can run scripts from it +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + pip_install: -e .[gpu] + ssh_clone: false # Should be true if using a private repo + git_commit: 68448b2764cf6988c830e4d55796e6e28cdac20e + +# Uncomment and fill in to log to WandB. Also uncomment the loggers section near the bottom of the yaml +# - integration_type: wandb +# entity: mosaic-ml +# project: support-bot-demo-composer-codebase + +# cd into the llm-foundry/scripts directory and run the train.py script +command: | + cd llm-foundry/scripts + composer train/train.py /mnt/config/parameters.yaml || (echo "Command failed - killing python" && pkill python && exit 1) + +#image: "mosaicml/llm-foundry:2.0.1_cu118-latest" +image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + +# The parameters section is mounted to /mnt/config/parameters.yaml in the container +# The mounted file is then passed directly to the train.py script +# See LLM-foundry llmfoundry/scripts/train.py to see how the parameters are used in code +parameters: + # Path to load the weights from the previous step + + # Where to read the data from and save it to locally on the machine + data_remote: CLOUD://BUCKET_NAME/support-bot-demo/data/composer_30b/ + data_local: ./local-dataset-composercodebase-cache/ + + # Checkpoint to local filesystem or remote object store + save_interval: 2ep # How frequently to save checkpoints + save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK + save_folder: CLOUD://BUCKET_NAME/support-bot-demo/checkpoints/mpt-30b-chat_composer-codebase/ + save_weights_only: true # Since we only need the weights for the next step, we can reduce the size of the checkpoint + + # Maximum sequence length of the model + # For MPT, you can change this to a different number if you would like to train on longer sequences + # Note that you would also need to reprocess your data to contain longer sequences + max_seq_len: 8192 + + # Random seed to ensure reproducibility + global_seed: 15 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + + # Model + # This section is used by LLM-foundry to construct the model + model: + name: hf_causal_lm + init_device: mixed # 
Initially only create the model on CPU once per node to reduce system memory requirements + pretrained_model_name_or_path: mosaicml/mpt-30b-chat # This can be changed to other models from the HuggingFace model hub + pretrained: true # If false, will just load the model architecture and randomly initialize the weights + config_overrides: # Override the default model config (comment this out if you change the model from MPT) + attn_config: + attn_impl: triton # Use the triton implementation of attention + attn_uses_sequence_id: false # Restrict attention to within each concatenated sequence + + # Tokenizer + # This section is used by LLM-foundry to construct the tokenizer + tokenizer: + name: mosaicml/mpt-30b-chat # This can be changed along with the model + kwargs: + model_max_length: ${max_seq_len} + + # Dataloaders + train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: train + shuffle: true + max_seq_len: ${max_seq_len} + shuffle_seed: ${global_seed} + eos_token_id: 0 + drop_last: true + num_workers: 8 + + eval_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: validation + shuffle: false + max_seq_len: ${max_seq_len} + shuffle_seed: ${global_seed} + eos_token_id: 0 + drop_last: false + num_workers: 8 + + # Learning rate scheduler + # see LLM-foundry llmfoundry/utils/builders.py::build_scheduler for other built-in options + scheduler: + name: cosine_with_warmup + t_warmup: 10ba + alpha_f: 0.1 + + # Optimizer + # see LLM-foundry llmfoundry/utils/builders.py::build_optimizer for other built-in options + optimizer: + name: decoupled_lionw + lr: 0.0000001 + betas: + - 0.9 + - 0.99 + weight_decay: 0 + + # Algorithms to apply + # see https://docs.mosaicml.com/projects/composer/en/latest/trainer/algorithms.html + # for an explanation of algorithms in Composer + algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + + # Run configuration + max_duration: 8ep # Maximum duration of the run. Change to something shorter (e.g. 10ba) for a quick test run + eval_interval: 2ep # How frequently to evaluate the model + eval_first: true # Whether to evaluate the model before training + eval_subset_num_batches: -1 # How many batches to evaluate on. -1 means evaluate on the entire dataset + global_train_batch_size: 64 # Global batch size. 
This is the batch size across all GPUs and should be 8*num_gpus + seed: ${global_seed} + device_eval_batch_size: 8 # Evaluation batch size per GPU + device_train_microbatch_size: 1 + precision: amp_bf16 + + # Configuration settings for FSDP + # https://docs.mosaicml.com/projects/composer/en/latest/notes/distributed_training.html#fullyshardeddataparallel-fsdp + # for more information about FSDP in Composer + fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + verbose: false + + # Logging configuration + progress_bar: false + log_to_console: true + console_log_interval: 1ba + python_log_level: debug + + # Uncomment to log to WandB + # see LLM-foundry llmfoundry/utils/builders.py::build_logger for other built-in options + # loggers: + # wandb: {} + + # Callbacks + # see LLM-foundry llmfoundry/utils/builders.py::build_callbacks for other built-in options + callbacks: + # Periodically generate text from the model, uncomment only if you are logging to WandB + # generate_callback: + # batch_log_interval: 500 + # do_sample: true + # max_new_tokens: 100 + # prompts: + # - The quick brown fox jumps over + # - |- + # Vegan Banana Bread + # Instructions: + # 1. + # - The other day I was explaining what generative AI is to my five year old. + # - We are a global semiconductor company primarily offering + # - Our company had revenue of + # - Our business operations are subject to numerous risks, including + # temperature: 1 + # top_k: 50 + # top_p: 0.95 + # use_cache: true + # Log information about the processing speed of the model + speed_monitor: + window_size: 10 + # Log the learning rate over the course of training + lr_monitor: {} + # Log information about the memory usage of the model + memory_monitor: {} + # Log an estimate of how long the training run has left to complete + runtime_estimator: {} + + # In-context learning tasks to evaluate on + # We include one of the tasks from our evaluation suite here as an example + # see https://github.com/mosaicml/llm-foundry/tree/main/scripts/eval for more details on evaluation using LLM-foundry + icl_tasks: + - + label: arc_easy + dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: 'Answer: ' \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/repo_downloader.py b/examples/end-to-end-examples/support_chatbot/repo_downloader.py new file mode 100644 index 000000000..eb42c3a4b --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/repo_downloader.py @@ -0,0 +1,140 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import sys + +from git.repo import Repo + +class RepoDownloader: + """Downloads .md, .py, and .YAML files in git repositories to text files that + land in /scripts/train/support_chatbot/retrieval_data/{REPOSITORY_NAME} + + Args: + output_dir (str): The path of the directory where the downloaded repository will be saved + repo_url (str): The url for the git repository + + Attributes: + output_dir (str): The path of the directory where the downloaded repository will be saved + repo_url (str): The url for the git repository + repo_name (str): The name of the git repository + clone_dir (str): The path of the directory where the git repository will be cloned + + Raises: + ValueError: If the clone_dir 
(directory of os.path.join(current_dir, self.repo_name)) already exists + + Warning: + Make sure to use the actual github link (example: https://github.com/KuuCi/test_repository) + instead of the clone link which will end in '.git' + + Example: + .. testcode:: + + import sys + + for repo_url in sys.argv[1:]: + downloader = RepoDownloader(repo_url) + downloader.download_repo() + """ + + def __init__(self, + output_dir: str, + current_dir: str, + repo_url: str) -> None: + + self.output_dir = output_dir + self.repo_url = repo_url + self.repo_name = repo_url.split('/')[-1] + self.clone_dir = os.path.join(current_dir, self.repo_name) + + if os.path.exists(self.clone_dir): + raise ValueError(f"{self.clone_dir} already exists. Please choose a path that doesn't contain the repository name.") + + def get_github_file_url(self, file_path: str) -> str: + """Generate GitHub URL for a specific file in the repository.""" + relative_path = os.path.relpath(file_path, self.clone_dir) + # Ensure that the base GitHub URL is always included + github_file_url = f"https://github.com/{self.repo_url.split('/')[-2]}/{self.repo_name}/blob/main/{relative_path}" + return github_file_url + + + def prepare_output_file(self, file_path: str) -> str: + """Given the .py, .md, or .YAML file_path of the cloned git repository + file, returns the path of the new txt processed output file and creates + the new path's intermediate directory if it doesn't exist. + + Args: + file_path (str): the path of a .py, .md, or .yaml file in cloned repository + + Raises: + ValueError: If the file_path is not a .py, .md, or .yaml file + + Returns: + str: the path of the .txt version of that file + """ + _, ext = os.path.splitext(file_path) + if ext not in ['.yaml', '.py', '.md']: + raise ValueError(f'Unsupported file type: {ext}') + + github_url = self.get_github_file_url(file_path) + + # Convert the GitHub URL into the desired filename format + filename = github_url.replace("/", "{slash}").replace(".", "{dot}") + + output_file = os.path.join(self.output_dir, self.repo_name, filename + '.txt') + os.makedirs(os.path.dirname(output_file), exist_ok=True) + return output_file + + def file_to_txt(self, file_path: (str)) -> None: + """Given the file_path of a file in cloned repository, downloads it + to a .txt file and saves it in the same directory structure in + /scripts/train/support_chatbot/retrieval_data/{self.repo_name} + + Args: + file_path (str): the file_path of a .py file in cloned repository + """ + with open(file_path, 'r') as f: + code_content = f.read() + output_file = self.prepare_output_file(file_path) + with open(output_file, 'w') as out_file: + out_file.write(code_content) + + def download_repo(self) -> str: + """Given a git repository url clone the repository, then download all + repository .yaml, .py, and .md files as .txt files and save them in + /scripts/train/support_chatbot/retrieval_data/{self.repo_name} + + Returns: + The path of the downloaded repository (/scripts/train/support_chatbot/retrieval_data/{self.repo_name}) + """ + # Cloning the repo + Repo.clone_from(self.repo_url, self.clone_dir) + + # Downloading each file + for root, _, files in os.walk(self.clone_dir): + for file in files: + if file.endswith(('.yaml', '.py', '.md')): + full_file_path = os.path.join(root, file) + _, ext = os.path.splitext(full_file_path) + if ext == '.yaml' or ext == '.py' or ext == '.md': + self.file_to_txt(full_file_path) + else: + print(f'Unsupported file type: {ext}') + + shutil.rmtree(self.clone_dir) + return 
os.path.join(self.output_dir, self.repo_name) + +def main() -> None: + output_dir = 'retrieval_data' + if len(sys.argv) < 2: + raise ValueError("At least one repository URL must be provided as an argument.") + + for repo_url in sys.argv[1:]: + downloader = RepoDownloader(output_dir, "", repo_url) + if os.path.exists(downloader.clone_dir): + continue + downloader.download_repo() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/requirements-cpu.txt b/examples/end-to-end-examples/support_chatbot/requirements-cpu.txt new file mode 100644 index 000000000..d46af6eca --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/requirements-cpu.txt @@ -0,0 +1,8 @@ +langchain==0.0.205 +composer[streaming,libcloud,oci,nlp]==0.15.1 +mosaicml-cli==0.4.17 +gradio==3.33.1 +faiss-cpu==1.7.4 +sentencepiece==0.1.97 +oauthlib>=2.1.0,<3.0.0 +git+https://github.com/mosaicml/llm-foundry.git@68448b2764cf6988c830e4d55796e6e28cdac20e#egg=llm-foundry \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/requirements.txt b/examples/end-to-end-examples/support_chatbot/requirements.txt new file mode 100644 index 000000000..4fff742d3 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/requirements.txt @@ -0,0 +1,2 @@ +composer[nlp,streaming,wandb]==0.15.1 +git+https://github.com/mosaicml/llm-foundry.git@68448b2764cf6988c830e4d55796e6e28cdac20e#egg=llm-foundry \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/scripts/conversion/convert_txt_to_stream.py b/examples/end-to-end-examples/support_chatbot/scripts/conversion/convert_txt_to_stream.py new file mode 100644 index 000000000..03ac064e9 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/scripts/conversion/convert_txt_to_stream.py @@ -0,0 +1,232 @@ +# Copyright 2022 MosaicML Examples authors +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import random +from argparse import ArgumentParser, Namespace +from typing import Dict, Iterable, Optional +from llmfoundry.data import ConcatTokensDataset # type: ignore +from streaming import MDSWriter +from torch.utils.data import DataLoader, Dataset, get_worker_info +from tqdm import tqdm +from transformers import AutoTokenizer + + +def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Convert dataset into MDS format, optionally concatenating and tokenizing' + ) + parser.add_argument( + '--max_workers', + type=int, + default=64, + required=False, + help='The maximum number of workers to use for MDS writing') + + parser.add_argument( + '--out_root', + type=str, + required=True, + help='The folder to write output to') + + parser.add_argument( + '--in_root', + type=str, + required=True, + help='The folder to read input from') + + parser.add_argument( + '--compression', + type=str, + default='zstd', + help='The compression algorithm to use for MDS writing') + + parser.add_argument( + '--concat_tokens', + type=int, + default=8192, + required=False, + help='Convert text to tokens and concatenate up to this many tokens') + + parser.add_argument( + '--tokenizer', + type=str, + default='mosaicml/mpt-30b', + required=False, + help='The name of the tokenizer to use') + parser.add_argument( + '--bos_text', + type=str, + default=None, + required=False, + help='The text to prepend to each example to separate concatenated 
examples') + parser.add_argument( + '--eos_text', + type=str, + default='<|endoftext|>', + required=False, + help='The text to append to each example to separate concatenated examples') + parser.add_argument( + '--no_wrap', + action='store_true', + required=False, + help='Whether to let text examples wrap across multiple training examples') + + parsed = parser.parse_args() + + # Make sure we have needed concat options + if (parsed.concat_tokens is not None and + isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): + parser.error( + 'When setting --concat_tokens, you must specify a --tokenizer') + + # now that we have validated them, change BOS/EOS to strings + if parsed.bos_text is None: + parsed.bos_text = '' + if parsed.eos_text is None: + parsed.eos_text = '' + return parsed + + +def build_dataloader(dataset: Dataset, batch_size: int) -> DataLoader: + return DataLoader( + dataset=dataset, + sampler=None, + batch_size=batch_size, + num_workers=8, + prefetch_factor=2, + ) + +def generate_samples( + loader: DataLoader, + truncate_num_samples: Optional[int] = None +) -> Iterable[Dict[str, bytes]]: + """Generator over samples of a dataloader. + + Args: + loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} + truncate_num_samples (Optional[int]): An optional # of samples to stop at. + + Yields: + Sample dicts. + """ + n_samples = 0 + for batch in loader: + keys = list(batch.keys()) + current_bs = len(batch[keys[0]]) + for idx in range(current_bs): + if truncate_num_samples is not None and n_samples == truncate_num_samples: + return + n_samples += 1 + yield {k: v[idx] for k, v in batch.items()} + +class DatasetIterable: + def __init__(self, dataset: list[str]): + self.dataset = list(set(dataset)) # Remove duplicates + print(f'Total files in the dataset: {len(self.dataset)}') + + def __iter__(self): + worker_info = get_worker_info() + worker_id = worker_info.id if worker_info else 0 + num_workers = worker_info.num_workers if worker_info else 1 + string_shard = self.dataset[worker_id::num_workers] + print(f'Worker {worker_id} processing {len(string_shard)} files') + + for file in string_shard: + print(f'Processing file: {file}') + try: + with open(file, 'r') as f: + for line in f: + yield {'text': line.strip()} + except Exception as e: + print(f'Error processing file: {file}. Error: {e}') + + +def main( + input_folder: str, + output_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + max_workers: int, + compression: str) -> None: + """Convert the generic txt dataset into MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use. + output_folder (str): Folder to write output to. + input_folder (str): Folder to read input from. + dataset_subset (str): Dataset subset to use. + concat_tokens (int): Number of tokens to concatenate. + eos_text (str): Text to append to end of each sample. + bos_text (str): Text to prepend to beginning of each sample. + no_wrap (bool): Whether to allow wrapping of text across samples. + max_workers (int): Max # of workers to use. + compression (str): Compression to use. 
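+
+    Example:
+        A typical invocation (mirroring mcli_yamls/conversion/convert_txt_to_stream.yaml
+        in this example) goes through the argparse entrypoint at the bottom of this script:
+
+            python scripts/conversion/convert_txt_to_stream.py \
+                --out_root CLOUD://BUCKET/support-bot-demo/data/composer_30b/ \
+                --in_root retrieval_data/composer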
+ """ + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + # we will enforce length, so suppress warnings about sequences too long for the model + tokenizer.model_max_length = int(1e30) + columns = {'tokens': 'bytes'} + + files = os.listdir(input_folder) + txt_files = [os.path.join(input_folder, f) for f in files if f.endswith('.txt')] + random.shuffle(txt_files) + num_files = len(txt_files) + + split_dataset = { + 'train': txt_files[:int(num_files * 0.8)], + 'validation': txt_files[int(num_files * 0.8):int(num_files * 0.9)], + 'test': txt_files[int(num_files * 0.9):] + } + + + for split in ['train', 'validation', 'test']: + print(f'Processing {split}') + data = split_dataset[split] + + # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up to the maximum sequence length + dataset = ConcatTokensDataset( + hf_dataset=DatasetIterable(dataset=data), + max_length=concat_tokens, + tokenizer=tokenizer, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + ) + + # Generate samples + loader = build_dataloader(dataset=dataset, batch_size=512) + samples = generate_samples(loader) + + # Write samples in MDS format + print(f'Converting to MDS format...') + with MDSWriter(out=os.path.join(output_folder, split), + max_workers=max_workers, + progress_bar=False, + columns=columns, + compression=compression) as out: + for sample in tqdm(samples): + out.write(sample) + + +if __name__ == '__main__': + args = parse_args() + main( + tokenizer_name=args.tokenizer, + output_folder=args.out_root, + input_folder=args.in_root, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + max_workers=args.max_workers, + compression=args.compression, + ) \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/scripts/deployment_download_helper.py b/examples/end-to-end-examples/support_chatbot/scripts/deployment_download_helper.py new file mode 100644 index 000000000..c57346eba --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/scripts/deployment_download_helper.py @@ -0,0 +1,49 @@ +# Copyright 2022 MosaicML Examples authors +# SPDX-License-Identifier: Apache-2.0 + +import os + +from composer.utils import maybe_create_object_store_from_uri, parse_uri + +# This api exists in llm-foundry repo +from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft # isort: skip # yapf: disable # type: ignore + +LOCAL_BASE_FOLDER = '/downloaded_hf_checkpoint/' +# should match with LOCAL_CHECKPOINT_DIR in mpt_ft_handler.py +LOCAL_FT_FOLDER = '/tmp/mpt' + + +def download_and_convert(remote_uri: str, gpus: int = 1): + """Helper function used to download the model at startup time of an. + + inference deployment. + + It is specifically written for MPT, and the file list would need to be adapted to use with + a different model. + + Args: + remote_uri (str): Object store prefix of the folder containing the model files. 
+ gpus (int): Number of gpus to use for inference + """ + object_store = maybe_create_object_store_from_uri(remote_uri) + assert object_store is not None # pyright + _, _, remote_base_key = parse_uri(remote_uri) + + # These files are hardcoded for MPT, and would need to be changed for a different model + files = [ + 'adapt_tokenizer.py', 'attention.py', 'blocks.py', 'config.json', + 'configuration_mpt.py', 'custom_embedding.py', 'flash_attn_triton.py', 'fc.py', + 'ffn.py', 'generation_config.json', 'hf_prefixlm_converter.py', + 'meta_init_context.py', 'modeling_mpt.py', 'norm.py', + 'param_init_fns.py', 'pytorch_model.bin', 'special_tokens_map.json', + 'tokenizer.json', 'tokenizer_config.json' + ] + os.makedirs(LOCAL_BASE_FOLDER, exist_ok=True) + for file in files: + object_store.download_object( + object_name=os.path.join(remote_base_key, file), + filename=os.path.join(LOCAL_BASE_FOLDER, file), + ) + convert_mpt_to_ft(model_name_or_path=LOCAL_BASE_FOLDER, + output_dir=LOCAL_FT_FOLDER, + infer_gpu_num=gpus) \ No newline at end of file diff --git a/examples/end-to-end-examples/support_chatbot/web_downloader.py b/examples/end-to-end-examples/support_chatbot/web_downloader.py new file mode 100644 index 000000000..18d5a82e4 --- /dev/null +++ b/examples/end-to-end-examples/support_chatbot/web_downloader.py @@ -0,0 +1,183 @@ +import urllib.request +from bs4 import BeautifulSoup +import os +import re +import html + + +all_links = [ + 'https://docs.mosaicml.com', + 'https://docs.mosaicml.com/projects/composer/', + 'https://docs.mosaicml.com/projects/composer/en/stable/getting_started/installation.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/getting_started/quick_start.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/getting_started/welcome_tour.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/getting_started.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/functional_api.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/medical_image_segmentation.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/custom_speedup_methods.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/ffcv_dataloaders.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/finetune_huggingface.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/pretrain_finetune_huggingface.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/migrate_from_ptl.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/early_stopping.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/auto_microbatching.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/checkpoint_autoresume.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/exporting_for_inference.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/TPU_Training_in_composer.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/examples/training_with_submitit.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/tutorials/train_resnet50_on_aws.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/algorithms.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/functional_api.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/using_the_trainer.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/composer_model.html', + 
'https://docs.mosaicml.com/projects/composer/en/stable/trainer/dataloaders.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/evaluation.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/schedulers.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/time.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/events.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/checkpointing.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/logging.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/file_uploading.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/callbacks.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/performance_tutorials/profiling.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/trainer/performance_tutorials/analyzing_traces.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/distributed_training.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/early_stopping.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/numerics.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/auto_microbatching.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/resumption.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/tensorboard_logger.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/notes/run_name.html', + 'https://docs.mosaicml.com/projects/composer/en/stable/method_cards/methods_overview.html', + 'https://docs.mosaicml.com/projects/mcli/', + 'https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/getting_started.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/environment.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/quick_start_training.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/quick_start_inference.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/quick_start/managing_clusters.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/training/common_commands.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/training/yaml_schema.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/training/run_lifecycle.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/training/working_with_runs.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/training/interactive.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/inference/inference_commands.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/inference/inference_schema.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/inference/working_with_deployments.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/inference/deployment_features.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/python/python_api.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/guides/first_llm.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/guides/sweeps.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/guides/advanced_sweeps_with_optuna.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/git.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/system_dependencies.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/pypi.html', +
'https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/wandb.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/integrations/comet.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/docker.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/git.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/env.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/wandb.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/mosaicml.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/s3.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/oci.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/gcp.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/coreweave.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/cloudflare.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/mounted.html', + 'https://docs.mosaicml.com/projects/mcli/en/latest/resources/secrets/ssh.html', + 'https://docs.mosaicml.com/projects/streaming/', + 'https://docs.mosaicml.com/projects/streaming/en/stable/getting_started/installation.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/getting_started/quick_start.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/getting_started/user_guide.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/dataset_format.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/dataset_conversion_guide.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/compression.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/hashing.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/environments.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/shuffling.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/fundamentals/sampling.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/how_to_guides/configure_cloud_storage_credentials.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/how_to_guides/dataset_conversion_to_mds_format.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/examples/cifar10.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/examples/facesynthetics.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/examples/synthetic_nlp.html', + 'https://docs.mosaicml.com/projects/streaming/en/stable/examples/multiprocess_dataset_conversion.html', +] + +class WebScraper: + def __init__(self, + path: str, + target_links: list[str] = all_links): + self.target_links = target_links + self.destination_folder =os.path.join(path, 'scraped') + + if not os.path.exists(self.destination_folder): + os.makedirs(self.destination_folder) + + def _clean_text(self, text: str) -> str: + """ + Cleans the extracted text by removing excessive newlines and spaces. 
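+
+        Specifically, runs of consecutive newlines are collapsed into a single
+        newline and leading/trailing whitespace is stripped; spacing within a
+        line is left unchanged.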
+ """ + text = re.sub(r'\n+', '\n', text) + text = text.strip() # Remove starting and ending white spaces + return text + + def _extract_codecells(self, soup: BeautifulSoup) -> list[str]: + code_blocks = [] + + for pre_tag in soup.find_all('pre', id=lambda x: x and x.startswith('codecell')): + # Combining the text from each span within the pre tag + code_text = ''.join(span.get_text() for span in pre_tag.find_all('span')) + code_blocks.append(code_text) + + return code_blocks + + @staticmethod + def url_to_filename(url: str) -> str: + return url.replace('/', '{slash}').replace('.', '{dot}').replace(':', '{colon}') + + def scrape(self) -> None: + for link in self.target_links: + self._save_content_from_link(link) + + def _save_content_from_link(self, link: str) -> None: + try: + link_response = urllib.request.urlopen(link) + except urllib.error.HTTPError as e: + if e.code == 404: # Not Found + return + else: + raise # You might want to consider propagating the exception for other HTTP errors. + except Exception as e: + return + + link_content = link_response.read().decode('utf-8') + + # Detect content type based on file extension or MIME type + if link.endswith(".html") or "text/html" in link_response.headers.get('Content-Type', ''): + parser_type = 'html.parser' + else: + parser_type = 'xml' + + soup_content = BeautifulSoup(link_content, parser_type) + + # Extract the 'codecell' div content if present + code_cells = self._extract_codecells(soup_content) + + # Extract relevant textual content + text_sections = soup_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']) + text_content = "\n".join(section.get_text() for section in text_sections) + + # Add the highlights (code snippets) to the text content + text_content += "\n\n" + "\n\n".join(code_cells) + + # Clean the text content for better readability + text_content = self._clean_text(text_content) + + # Unescape HTML entities for HTML content + if parser_type == 'html.parser': + text_content = html.unescape(text_content) + + filename = os.path.join(self.destination_folder, self.url_to_filename(link) + '.txt') + with open(filename, 'w') as file: + file.write(text_content) \ No newline at end of file