modify code structure
hiyouga committed Jul 15, 2023
1 parent 2a0f1f8 commit f751376
Showing 57 changed files with 2,002 additions and 1,819 deletions.
19 changes: 12 additions & 7 deletions README.md
@@ -95,7 +95,7 @@ huggingface-cli login
- Python 3.8+ and PyTorch 1.13.1+
- 🤗Transformers, Datasets, Accelerate, PEFT and TRL
- jieba, rouge-chinese and nltk (used at evaluation)
-- gradio and mdtex2html (used in web_demo.py)
+- gradio and matplotlib (used in web_demo.py)
- uvicorn, fastapi and sse-starlette (used in api_demo.py)

And **powerful GPUs**!
@@ -137,7 +137,8 @@ python -m transformers.models.llama.convert_llama_weights_to_hf \
### (Continually) Pre-Training

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_pt.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage pt \
--model_name_or_path path_to_your_model \
--do_train \
--dataset wiki_demo \
@@ -158,7 +159,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_pt.py \
### Supervised Fine-Tuning

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage sft \
--model_name_or_path path_to_your_model \
--do_train \
--dataset alpaca_gpt4_en \
@@ -179,7 +181,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
### Reward Model Training

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_rm.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage rm \
--model_name_or_path path_to_your_model \
--do_train \
--dataset comparison_gpt4_en \
@@ -199,7 +202,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_rm.py \
### PPO Training (RLHF)

```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_ppo.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage ppo \
--model_name_or_path path_to_your_model \
--do_train \
--dataset alpaca_gpt4_en \
@@ -222,7 +226,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_ppo.py \

```bash
accelerate config # configure the environment
-accelerate launch src/train_XX.py # arguments (same as above)
+accelerate launch src/train_bash.py # arguments (same as above)
```

<details><summary>Example configuration for full-tuning with DeepSpeed ZeRO-2</summary>
@@ -256,7 +260,8 @@ use_cpu: false
### Evaluation (BLEU and ROUGE_CHINESE)
```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_sft.py \
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage pt \
--model_name_or_path path_to_your_model \
--do_eval \
--dataset alpaca_gpt4_en \
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
5 changes: 3 additions & 2 deletions requirements.txt
@@ -8,8 +8,9 @@ sentencepiece
jieba
rouge-chinese
nltk
-gradio
-mdtex2html
+gradio>=3.36.0
uvicorn
+pydantic==1.10.7
fastapi
sse-starlette
+matplotlib
55 changes: 55 additions & 0 deletions setup.py
@@ -0,0 +1,55 @@
import os
import re
from setuptools import setup, find_packages


def get_version():
with open(os.path.join("src", "llmtuner", "__init__.py"), "r", encoding="utf-8") as f:
file_content = f.read()
pattern = r"{0}\W*=\W*\"([^\"]+)\"".format("__version__")
version, = re.findall(pattern, file_content)
return version


def get_requires():
with open("requirements.txt", "r", encoding="utf-8") as f:
file_content = f.read()
lines = [line.strip() for line in file_content.strip().split("\n") if not line.startswith("#")]
return lines


def main():

setup(
name="llmtuner",
version=get_version(),
author="hiyouga",
author_email="hiyouga" "@" "buaa.edu.cn",
description="Easy-to-use fine-tuning framework using PEFT",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords=["LLaMA", "BLOOM", "Falcon", "LLM", "ChatGPT", "transformer", "pytorch", "deep learning"],
license="Apache 2.0 License",
url="https://github.com/hiyouga/LLaMA-Efficient-Tuning",
package_dir={"": "src"},
packages=find_packages("src"),
python_requires=">=3.8.0",
install_requires=get_requires(),
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
)


if __name__ == "__main__":
main()
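This commit also turns the project into an installable `llmtuner` distribution (the new pyproject.toml plus the setup.py above). A minimal sanity check, assuming an editable install with `pip install -e .` from the repository root; the install command itself is not part of this diff:

```python
# Hypothetical check after an editable install. The import names come from
# other files in this commit, and __version__ is the string that setup.py's
# get_version() reads out of src/llmtuner/__init__.py.
import llmtuner
from llmtuner import create_app  # used by the new src/api_demo.py

print(llmtuner.__version__)
```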
218 changes: 2 additions & 216 deletions src/api_demo.py
@@ -4,225 +4,11 @@
# Visit http://localhost:8000/docs for document.


import time
import torch
import uvicorn
from threading import Thread
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from transformers import TextIteratorStreamer
from sse_starlette import EventSourceResponse
from typing import Any, Dict, List, Literal, Optional

from utils import (
Template,
load_pretrained,
prepare_infer_args,
get_logits_processor
)


@asynccontextmanager
async def lifespan(app: FastAPI): # collects GPU memory
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)


app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)


class ModelCard(BaseModel):
id: str
object: Optional[str] = "model"
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
owned_by: Optional[str] = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = []


class ModelList(BaseModel):
object: Optional[str] = "list"
data: Optional[List[ModelCard]] = []


class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system"]
content: str


class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = None
top_p: Optional[float] = None
n: Optional[int] = 1
max_tokens: Optional[int] = None
stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None


class ChatCompletionResponseUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int


class ChatCompletionResponse(BaseModel):
id: Optional[str] = "chatcmpl-default"
object: Literal["chat.completion"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: ChatCompletionResponseUsage


class ChatCompletionStreamResponse(BaseModel):
id: Optional[str] = "chatcmpl-default"
object: Literal["chat.completion.chunk"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]


@app.get("/v1/models", response_model=ModelList)
async def list_models():
global model_args
model_card = ModelCard(id="gpt-3.5-turbo")
return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer, source_prefix, generating_args

if request.messages[-1].role != "user":
raise HTTPException(status_code=400, detail="Invalid request")
query = request.messages[-1].content

prev_messages = request.messages[:-1]
if len(prev_messages) > 0 and prev_messages[0].role == "system":
prefix = prev_messages.pop(0).content
else:
prefix = source_prefix

history = []
if len(prev_messages) % 2 == 0:
for i in range(0, len(prev_messages), 2):
if prev_messages[i].role == "user" and prev_messages[i+1].role == "assistant":
history.append([prev_messages[i].content, prev_messages[i+1].content])

inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")
inputs = inputs.to(model.device)

gen_kwargs = generating_args.to_dict()
gen_kwargs.update({
"input_ids": inputs["input_ids"],
"temperature": request.temperature if request.temperature else gen_kwargs["temperature"],
"top_p": request.top_p if request.top_p else gen_kwargs["top_p"],
"logits_processor": get_logits_processor()
})

if request.max_tokens:
gen_kwargs.pop("max_length", None)
gen_kwargs["max_new_tokens"] = request.max_tokens

if request.stream:
generate = predict(gen_kwargs, request.model)
return EventSourceResponse(generate, media_type="text/event-stream")

generation_output = model.generate(**gen_kwargs)
outputs = generation_output.tolist()[0][len(inputs["input_ids"][0]):]
response = tokenizer.decode(outputs, skip_special_tokens=True)

usage = ChatCompletionResponseUsage(
prompt_tokens=len(inputs["input_ids"][0]),
completion_tokens=len(outputs),
total_tokens=len(inputs["input_ids"][0]) + len(outputs)
)

choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)

return ChatCompletionResponse(model=request.model, choices=[choice_data], usage=usage, object="chat.completion")


async def predict(gen_kwargs: Dict[str, Any], model_id: str):
global model, tokenizer

streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs["streamer"] = streamer

thread = Thread(target=model.generate, kwargs=gen_kwargs)
thread.start()

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)

for new_text in streamer:
if len(new_text) == 0:
continue

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(content=new_text),
finish_reason=None
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)

choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionStreamResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield chunk.json(exclude_unset=True, ensure_ascii=False)
yield "[DONE]"
+from llmtuner import create_app


if __name__ == "__main__":
-    model_args, data_args, finetuning_args, generating_args = prepare_infer_args()
-    model, tokenizer = load_pretrained(model_args, finetuning_args)

-    prompt_template = Template(data_args.prompt_template)
-    source_prefix = data_args.source_prefix if data_args.source_prefix else ""

+    app = create_app()
uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
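With the server logic moved into `llmtuner.create_app()`, api_demo.py still serves the OpenAI-style endpoints defined by the request/response models removed above. A hedged client-side sketch: the field names mirror the deleted `ChatCompletionRequest`, the host and port match the `uvicorn.run` call, and everything else is illustrative.

```python
# Illustrative, non-streaming request against a locally running api_demo server.
# Field names follow the ChatCompletionRequest/ChatCompletionResponse models
# that this commit removed from api_demo.py; the values are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",  # the placeholder id returned by /v1/models
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "stream": False,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```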
14 changes: 4 additions & 10 deletions src/cli_demo.py
@@ -2,21 +2,15 @@
# Implements stream chat in command line for fine-tuned models.
# Usage: python cli_demo.py --model_name_or_path path_to_model --checkpoint_dir path_to_checkpoint


-from utils import (
-    Template,
-    load_pretrained,
-    prepare_infer_args,
-    get_logits_processor
-)
from threading import Thread
from transformers import TextIteratorStreamer

+from llmtuner import Template, get_infer_args, load_model_and_tokenizer, get_logits_processor

-def main():
-
-    model_args, data_args, finetuning_args, generating_args = prepare_infer_args()
-    model, tokenizer = load_pretrained(model_args, finetuning_args)
+def main():
+    model_args, data_args, finetuning_args, generating_args = get_infer_args()
+    model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args)

prompt_template = Template(data_args.prompt_template)
source_prefix = data_args.source_prefix if data_args.source_prefix else ""
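The remainder of cli_demo.py is not shown in this diff. As a rough sketch of how the refactored llmtuner entry points above might drive streaming generation, the following mirrors the streamer-plus-thread pattern removed from api_demo.py; the helper name `chat_once` and its argument list are assumptions for illustration, not part of this commit:

```python
# Illustrative only: reuses the generation pattern deleted from api_demo.py,
# wired to the llmtuner imports shown in the new cli_demo.py.
from threading import Thread
from transformers import TextIteratorStreamer

from llmtuner import get_logits_processor


def chat_once(model, tokenizer, prompt_template, generating_args, query, history, prefix):
    # Build the prompt the same way the removed api_demo.py did.
    inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")
    inputs = inputs.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = generating_args.to_dict()
    gen_kwargs.update({
        "input_ids": inputs["input_ids"],
        "streamer": streamer,
        "logits_processor": get_logits_processor(),
    })

    # Run generation in a background thread so tokens can be printed as they arrive.
    Thread(target=model.generate, kwargs=gen_kwargs).start()

    response = ""
    for new_text in streamer:
        print(new_text, end="", flush=True)
        response += new_text
    print()
    return response
```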
