modify code structure
hiyouga committed Jul 15, 2023
1 parent 2a0f1f8 commit f751376
Showing 57 changed files with 2,002 additions and 1,819 deletions.
19 changes: 12 additions & 7 deletions README.md
@@ -95,7 +95,7 @@ huggingface-cli login
- Python 3.8+ and PyTorch 1.13.1+
- 🤗Transformers, Datasets, Accelerate, PEFT and TRL
- jieba, rouge-chinese and nltk (used at evaluation)
-- gradio and mdtex2html (used in web_demo.py)
+- gradio and matplotlib (used in web_demo.py)
- uvicorn, fastapi and sse-starlette (used in api_demo.py)

And **powerful GPUs**!
@@ -137,7 +137,8 @@ python -m transformers.models.llama.convert_llama_weights_to_hf \
### (Continually) Pre-Training

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_pt.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage pt \
--model_name_or_path path_to_your_model \
--do_train \
--dataset wiki_demo \
@@ -158,7 +159,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_pt.py \
### Supervised Fine-Tuning

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage sft \
--model_name_or_path path_to_your_model \
--do_train \
--dataset alpaca_gpt4_en \
@@ -179,7 +181,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
### Reward Model Training

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_rm.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage rm \
--model_name_or_path path_to_your_model \
--do_train \
--dataset comparison_gpt4_en \
@@ -199,7 +202,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_rm.py \
### PPO Training (RLHF)

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_ppo.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage ppo \
--model_name_or_path path_to_your_model \
--do_train \
--dataset alpaca_gpt4_en \
@@ -222,7 +226,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_ppo.py \

```bash
accelerate config # configure the environment
-accelerate launch src/train_XX.py # arguments (same as above)
+accelerate launch src/train_bash.py # arguments (same as above)
```

<details><summary>Example configuration for full-tuning with DeepSpeed ZeRO-2</summary>
@@ -256,7 +260,8 @@ use_cpu: false
### Evaluation (BLEU and ROUGE_CHINESE)
```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage pt \
--model_name_or_path path_to_your_model \
--do_eval \
--dataset alpaca_gpt4_en \
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
5 changes: 3 additions & 2 deletions requirements.txt
@@ -8,8 +8,9 @@ sentencepiece
jieba
rouge-chinese
nltk
-gradio
-mdtex2html
+gradio>=3.36.0
uvicorn
+pydantic==1.10.7
fastapi
sse-starlette
+matplotlib
55 changes: 55 additions & 0 deletions setup.py
@@ -0,0 +1,55 @@
import os
import re
from setuptools import setup, find_packages


def get_version():
with open(os.path.join("src", "llmtuner", "__init__.py"), "r", encoding="utf-8") as f:
file_content = f.read()
pattern = r"{0}\W*=\W*\"([^\"]+)\"".format("__version__")
version, = re.findall(pattern, file_content)
return version


def get_requires():
with open("requirements.txt", "r", encoding="utf-8") as f:
file_content = f.read()
lines = [line.strip() for line in file_content.strip().split("\n") if not line.startswith("#")]
return lines


def main():

setup(
name="llmtuner",
version=get_version(),
author="hiyouga",
author_email="hiyouga" "@" "buaa.edu.cn",
description="Easy-to-use fine-tuning framework using PEFT",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords=["LLaMA", "BLOOM", "Falcon", "LLM", "ChatGPT", "transformer", "pytorch", "deep learning"],
license="Apache 2.0 License",
url="https://github.com/hiyouga/LLaMA-Efficient-Tuning",
package_dir={"": "src"},
packages=find_packages("src"),
python_requires=">=3.8.0",
install_requires=get_requires(),
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
)


if __name__ == "__main__":
main()
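This commit also turns the project into an installable `llmtuner` distribution (the new pyproject.toml plus the setup.py above). A minimal sanity check, assuming an editable install with `pip install -e .` from the repository root; the install command itself is not part of this diff:

```python
# Hypothetical check after an editable install. The import names come from
# other files in this commit, and __version__ is the string that setup.py's
# get_version() reads out of src/llmtuner/__init__.py.
import llmtuner
from llmtuner import create_app  # used by the new src/api_demo.py

print(llmtuner.__version__)
```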
218 changes: 2 additions & 216 deletions src/api_demo.py
@@ -4,225 +4,11 @@
# Visit http://localhost:8000/docs for document.


import time
import torch
import uvicorn
from threading import Thread
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from transformers import TextIteratorStreamer
from sse_starlette import EventSourceResponse
from typing import Any, Dict, List, Literal, Optional

from utils import (
Template,
load_pretrained,
prepare_infer_args,
get_logits_processor
)


@asynccontextmanager
async def lifespan(app: FastAPI): # collects GPU memory
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)


app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)


class ModelCard(BaseModel):
id: str
object: Optional[str] = "model"
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
owned_by: Optional[str] = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = []


class ModelList(BaseModel):
object: Optional[str] = "list"
data: Optional[List[ModelCard]] = []


class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system"]
content: str


class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = None
top_p: Optional[float] = None
n: Optional[int] = 1
max_tokens: Optional[int] = None
stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None


class ChatCompletionResponseUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int


class ChatCompletionResponse(BaseModel):
id: Optional[str] = "chatcmpl-default"
object: Literal["chat.completion"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: ChatCompletionResponseUsage


class ChatCompletionStreamResponse(BaseModel):
id: Optional[str] = "chatcmpl-default"
object: Literal["chat.completion.chunk"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]


@app.get("/v1/models", response_model=ModelList)
async def list_models():
global model_args
model_card = ModelCard(id="gpt-3.5-turbo")
return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer, source_prefix, generating_args

if request.messages[-1].role != "user":
raise HTTPException(status_code=400, detail="Invalid request")
query = request.messages[-1].content

prev_messages = request.messages[:-1]
if len(prev_messages) > 0 and prev_messages[0].role == "system":
prefix = prev_messages.pop(0).content
else:
prefix = source_prefix

history = []
if len(prev_messages) % 2 == 0:
for i in range(0, len(prev_messages), 2):
if prev_messages[i].role == "user" and prev_messages[i+1].role == "assistant":
history.append([prev_messages[i].content, prev_messages[i+1].content])

inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")
inputs = inputs.to(model.device)

gen_kwargs = generating_args.to_dict()
gen_kwargs.update({
"input_ids": inputs["input_ids"],
"temperature": request.temperature if request.temperature else gen_kwargs["temperature"],
"top_p": request.top_p if request.top_p else gen_kwargs["top_p"],
"logits_processor": get_logits_processor()
})

if request.max_tokens:
gen_kwargs.pop("max_length", None)
gen_kwargs["max_new_tokens"] = request.max_tokens

if request.stream:
generate = predict(gen_kwargs, request.model)
return EventSourceResponse(generate, media_type="text/event-stream")

generation_output = model.generate(**gen_kwargs)
outputs = generation_output.tolist()[0][len(inputs["input_ids"][0]):]
response = tokenizer.decode(outputs, skip_special_tokens=True)

usage = ChatCompletionResponseUsage(
prompt_tokens=len(inputs["input_ids"][0]),
completion_tokens=len(outputs),
total_tokens=len(inputs["input_ids"][0]) + len(outputs)
)

choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)

return ChatCompletionResponse(model=request.model, choices=[choice_data], usage=usage, object="chat.completion")


async def predict(gen_kwargs: Dict[str, Any], model_id: str):
global model, tokenizer

streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs["streamer"] = streamer

thread = Thread(target=model.generate, kwargs=gen_kwargs)
thread.start()

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)

for new_text in streamer:
if len(new_text) == 0:
continue

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(content=new_text),
finish_reason=None
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)
yield "[DONE]"
+from llmtuner import create_app


if __name__ == "__main__":
-    model_args, data_args, finetuning_args, generating_args = prepare_infer_args()
-    model, tokenizer = load_pretrained(model_args, finetuning_args)

-    prompt_template = Template(data_args.prompt_template)
-    source_prefix = data_args.source_prefix if data_args.source_prefix else ""

+    app = create_app()
uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
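With the server logic moved into `llmtuner.create_app()`, api_demo.py still serves the OpenAI-style endpoints defined by the request/response models removed above. A hedged client-side sketch: the field names mirror the deleted `ChatCompletionRequest`, the host and port match the `uvicorn.run` call, and everything else is illustrative.

```python
# Illustrative, non-streaming request against a locally running api_demo server.
# Field names follow the ChatCompletionRequest/ChatCompletionResponse models
# that this commit removed from api_demo.py; the values are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",  # the placeholder id returned by /v1/models
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "stream": False,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```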
14 changes: 4 additions & 10 deletions src/cli_demo.py
@@ -2,21 +2,15 @@
# Implements stream chat in command line for fine-tuned models.
# Usage: python cli_demo.py --model_name_or_path path_to_model --checkpoint_dir path_to_checkpoint


-from utils import (
-    Template,
-    load_pretrained,
-    prepare_infer_args,
-    get_logits_processor
-)
from threading import Thread
from transformers import TextIteratorStreamer

+from llmtuner import Template, get_infer_args, load_model_and_tokenizer, get_logits_processor

-def main():
-
-    model_args, data_args, finetuning_args, generating_args = prepare_infer_args()
-    model, tokenizer = load_pretrained(model_args, finetuning_args)
+def main():
+    model_args, data_args, finetuning_args, generating_args = get_infer_args()
+    model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args)

prompt_template = Template(data_args.prompt_template)
source_prefix = data_args.source_prefix if data_args.source_prefix else ""
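The remainder of cli_demo.py is not shown in this diff. As a rough sketch of how the refactored llmtuner entry points above might drive streaming generation, the following mirrors the streamer-plus-thread pattern removed from api_demo.py; the helper name `chat_once` and its argument list are assumptions for illustration, not part of this commit:

```python
# Illustrative only: reuses the generation pattern deleted from api_demo.py,
# wired to the llmtuner imports shown in the new cli_demo.py.
from threading import Thread
from transformers import TextIteratorStreamer

from llmtuner import get_logits_processor


def chat_once(model, tokenizer, prompt_template, generating_args, query, history, prefix):
    # Build the prompt the same way the removed api_demo.py did.
    inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")
    inputs = inputs.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = generating_args.to_dict()
    gen_kwargs.update({
        "input_ids": inputs["input_ids"],
        "streamer": streamer,
        "logits_processor": get_logits_processor(),
    })

    # Run generation in a background thread so tokens can be printed as they arrive.
    Thread(target=model.generate, kwargs=gen_kwargs).start()

    response = ""
    for new_text in streamer:
        print(new_text, end="", flush=True)
        response += new_text
    print()
    return response
```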
