
Commit

Merge pull request #357 from foundation-model-stack/v2.0.0-rc2
release: merge set of changes for v2.0.0
Abhishek-TAMU authored Sep 30, 2024
2 parents 16543ee + a03e58f commit 3b150ab
Showing 45 changed files with 99,317 additions and 348 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/labelpr.yaml
@@ -0,0 +1,35 @@
name: Label PRs

on:
  pull_request_target:
    types: [opened, edited, synchronize, reopened]

jobs:
  label_pr:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v3
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const pr_welcome_msg = `Thanks for making a pull request! 😃\nOne of the maintainers will review and advise on the next steps.`;
            // https://github.com/commitizen/conventional-commit-types
            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert'];
            if(context.payload.pull_request.comments === 0) {
              await github.issues.createComment({ ...context.repo, issue_number: context.payload.number, body: pr_welcome_msg});
            }
            const title = context.payload.pull_request.title;
            const results = /^(\w+)(\(\w+\))?!?:/.exec(title);
            if (results === null) return core.setFailed(`The title does not follow conventional commits spec: https://www.conventionalcommits.org/en/v1.0.0/#summary Title: ${title}`);
            const pr_type = results[1];
            core.info(`pr_type: ${pr_type}`);
            if (!valid_pr_types.includes(pr_type)) return core.setFailed(`Unknown pull request type: ${pr_type}`);
            const labels = context.payload.pull_request.labels;
            const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
            new_labels.push({name: pr_type});
            await github.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
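
For context, here is a small Python sketch (not part of this commit) of the same conventional-commits check the workflow applies to PR titles; it can be handy for validating a title locally before pushing:

```python
import re
from typing import Optional

# Same allowed types and title pattern as the workflow above.
VALID_PR_TYPES = {"feat", "fix", "docs", "style", "refactor", "perf",
                  "test", "build", "ci", "chore", "revert"}
TITLE_RE = re.compile(r"^(\w+)(\(\w+\))?!?:")


def pr_type_from_title(title: str) -> Optional[str]:
    """Return the conventional-commit type of a PR title, or None if it is invalid."""
    match = TITLE_RE.match(title)
    if match is None or match.group(1) not in VALID_PR_TYPES:
        return None
    return match.group(1)


print(pr_type_from_title("feat(trainer): add padding-free attention"))  # -> "feat"
print(pr_type_from_title("update readme"))                              # -> None
```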
2 changes: 1 addition & 1 deletion .pylintrc
@@ -333,7 +333,7 @@ indent-string=' '
max-line-length=100

# Maximum number of lines in a module.
max-module-lines=1100
max-module-lines=1200

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
198 changes: 180 additions & 18 deletions README.md

Large diffs are not rendered by default.

44 changes: 30 additions & 14 deletions build/Dockerfile
@@ -19,17 +19,21 @@ ARG USER=tuning
ARG USER_UID=1000
ARG PYTHON_VERSION=3.11
ARG WHEEL_VERSION=""
## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_FMS_ACCELERATION=true

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} as base
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base

ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# Note this works for 3.9, 3.11, 3.12
RUN dnf remove -y --disableplugin=subscription-manager \
subscription-manager \
&& dnf install -y python${PYTHON_VERSION} procps \
&& dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
&& ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
&& python -m ensurepip --upgrade \
&& python -m pip install --upgrade pip \
@@ -44,14 +48,14 @@ RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
chmod g+rx /home/${USER}

## Used as base of the Release stage to remove unrelated packages and CVEs
FROM base as release-base
FROM base AS release-base

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts


## CUDA Base ###################################################################
FROM base as cuda-base
FROM base AS cuda-base

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
@@ -75,7 +79,7 @@ ENV CUDA_HOME="/usr/local/cuda" \
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## CUDA Development ############################################################
FROM cuda-base as cuda-devel
FROM cuda-base AS cuda-devel

# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
@@ -99,14 +103,13 @@ RUN dnf config-manager \

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

FROM cuda-devel as python-installations
FROM cuda-devel AS python-installations

ARG WHEEL_VERSION
ARG USER
ARG USER_UID

## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_FMS_ACCELERATION
ARG ENABLE_AIM

RUN dnf install -y git && \
# perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
@@ -132,19 +135,32 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user wheel && \
python -m pip install --user "$(head bdist_name)" && \
python -m pip install --user "$(head bdist_name)[flash-attn]" && \
if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[flash-attn]"

# fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
# fms_acceleration_foak = Fused LoRA and triton kernels
# fms_acceleration_aadp = Padding-Free Flash Attention Computation
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[fms-accel]"; \
python -m fms_acceleration.cli install fms_acceleration_peft; \
python -m fms_acceleration.cli install fms_acceleration_foak; \
python -m fms_acceleration.cli install fms_acceleration_aadp; \
fi

RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[aim]"; \
fi && \
fi

# Clean up the wheel module. It's only needed by flash-attn install
python -m pip uninstall wheel build -y && \
RUN python -m pip uninstall wheel build -y && \
# Cleanup the bdist whl file
rm $(head bdist_name) /tmp/bdist_name

## Final image ################################################
FROM release-base as release
FROM release-base AS release
ARG USER
ARG PYTHON_VERSION
ARG ENABLE_AIM

RUN mkdir -p /licenses
COPY LICENSE /licenses/
6 changes: 4 additions & 2 deletions build/README.md
@@ -38,14 +38,16 @@ For example, the below config is used for running with two GPUs and FSDP for fin
"per_device_train_batch_size": 4,
"learning_rate": 1e-5,
"response_template": "\n### Label:",
"dataset_text_field": "output"
"dataset_text_field": "output",
"lora_post_process_for_vllm": true
}
```

Users should always set `num_processes` to be explicit about the number of processes to run tuning on. When `num_processes` is greater than 1, the [FSDP config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/fixtures/accelerate_fsdp_defaults.yaml) is used by default. Thus in the above example, you don't need to pass in the FSDP flags since they match the ones used in the default FSDP config. You can also set your own default values by specifying your own config file using key `config_file`. Any of these values in configs can be overwritten by passing in flags via `accelerate_launch_args` in the JSON config.
`num_processes` defaults to the number of GPUs allocated for tuning, unless the user sets `SET_NUM_PROCESSES_TO_NUM_GPUS` to `False`. When `num_processes` is greater than 1, the [FSDP config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/fixtures/accelerate_fsdp_defaults.yaml) is used by default. Thus in the above example, you don't need to pass in the FSDP flags since they match the ones used in the default FSDP config. You can also set your own default values by specifying your own config file using the key `config_file`. Any of these values can be overridden by passing flags via `accelerate_launch_args` in the JSON config.

Note that `num_processes`, which is the total number of processes to be launched in parallel, should match the number of GPUs to run on. The number of GPUs used can also be set via the environment variable `CUDA_VISIBLE_DEVICES`. If `num_processes=1`, the script will assume a single GPU.

If tuning LoRA adapters for later inference on vLLM, set `lora_post_process_for_vllm` to `true` so the adapters are post-processed for vLLM compatibility: vLLM requires any new token embedding weights added during tuning to be moved into a separate `new_embeddings.safetensors` file.
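
For illustration, here is a minimal sketch of invoking that post-processing step by hand on a tuned LoRA checkpoint. The launcher in `build/accelerate_launch.py` does this automatically when `lora_post_process_for_vllm` is set; the paths below are placeholders.

```python
import json
import os

from tuning.utils.merge_model_utils import post_process_vLLM_adapters_new_tokens

output_dir = "/path/to/output_dir"  # placeholder: where tuning wrote its checkpoints
checkpoint_dir = os.path.join(output_dir, "checkpoint-100")  # placeholder checkpoint

# added_tokens_info.json is written during tuning and records how many tokens
# were added to the tokenizer (and therefore to the embedding matrix).
with open(os.path.join(output_dir, "added_tokens_info.json"), encoding="utf-8") as f:
    num_added_tokens = json.load(f)["num_new_tokens"]

# Rewrite the adapter in place: the embedding rows for the added tokens are moved
# out of the adapter weights into new_embeddings.safetensors, the layout vLLM expects.
post_process_vLLM_adapters_new_tokens(checkpoint_dir, checkpoint_dir, num_added_tokens)
```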

## Building the Image

137 changes: 50 additions & 87 deletions build/accelerate_launch.py
@@ -23,38 +23,29 @@
import subprocess
import sys
import traceback
from pathlib import Path
import json
from pathlib import Path

# Third Party
from accelerate.commands.launch import launch_command
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from torch import bfloat16

# Local
from build.utils import (
process_accelerate_launch_args,
get_highest_checkpoint,
)
from tuning.utils.merge_model_utils import (
post_process_vLLM_adapters_new_tokens,
)
from tuning.utils.config_utils import get_json_config
from tuning.utils.error_logging import (
write_termination_log,
USER_ERROR_EXIT_CODE,
INTERNAL_ERROR_EXIT_CODE,
)
from tuning.data import tokenizer_data_utils

ERROR_LOG = "/dev/termination-log"


def get_base_model_from_adapter_config(adapter_config):
"""Given path to adapter_config.json file, returns the base model name"""
with open(adapter_config, "r", encoding="utf-8") as config_file:
adapter_config = json.load(config_file)
return adapter_config.get("base_model_name_or_path")


def main():
if not os.getenv("TERMINATION_LOG_FILE"):
os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG
@@ -107,6 +98,8 @@ def main():
#
##########
output_dir = job_config.get("output_dir")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
# checkpoints outputted to tempdir, only final checkpoint copied to output dir
launch_command(args)
@@ -128,85 +121,55 @@
write_termination_log(f"Unhandled exception during training. {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# remove lm_head from granite with llama arch models
try:
checkpoint_dir = job_config.get("save_model_dir")
if not checkpoint_dir:
checkpoint_dir = os.path.join(
output_dir, get_highest_checkpoint(output_dir)
)

use_flash_attn = job_config.get("use_flash_attn", True)
adapter_config_path = os.path.join(checkpoint_dir, "adapter_config.json")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
peft_method = job_config.get("peft_method")

if job_config.get("lora_post_process_for_vllm") and peft_method == "lora":
save_model_dir = job_config.get("save_model_dir")
if save_model_dir:
if os.path.exists(os.path.join(save_model_dir, "added_tokens_info.json")):
with open(
os.path.join(save_model_dir, "added_tokens_info.json"),
encoding="utf-8",
) as json_data:
added_tokens_info = json.load(json_data)
num_added_tokens = added_tokens_info["num_new_tokens"]
else:
logging.warning(
"Failed to post-process: file added_tokens_info.json not in path %s",
save_model_dir,
)

if os.path.exists(adapter_config_path):
base_model_path = get_base_model_from_adapter_config(adapter_config_path)
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)
if os.path.exists(
os.path.join(save_model_dir, "adapter_model.safetensors")
):
post_process_vLLM_adapters_new_tokens(
save_model_dir, save_model_dir, num_added_tokens
)

# since the peft library (PEFTModelForCausalLM) does not handle cases
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
base_model,
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)
if (
os.path.exists(os.path.join(output_dir, "added_tokens_info.json"))
and job_config.get("save_strategy") != "no"
):
with open(
os.path.join(output_dir, "added_tokens_info.json"), encoding="utf-8"
) as json_data:
added_tokens_info = json.load(json_data)
num_added_tokens = added_tokens_info["num_new_tokens"]
# if multiple checkpoints in directory, process each checkpoint
for _, dirs, _ in os.walk(output_dir, topdown=False):
for name in dirs:
if "checkpoint-" in name.lower():
post_process_vLLM_adapters_new_tokens(
os.path.join(output_dir, name),
os.path.join(output_dir, name),
num_added_tokens,
)
else:
model = AutoModelForCausalLM.from_pretrained(
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
logging.warning(
"Failed to post-process: file added_tokens_info.json not in path %s",
save_model_dir,
)

model_arch = model.config.model_type
# check that it is a granite model with llama architecture with tied weights
# ie. lm_head is duplicate of embeddings

# a fine tuned model will have params_dict.get("model.embed_tokens.weight")
# a prompt adapter has params_dict.get("base_model.model.embed_tokens.weight")
# a lora adapter has params_dict.get("base_model.model.model.embed_tokens.weight")
if model_arch == "llama" and hasattr(model, "lm_head"):
if (
# lora tuned model has an addt model layer
(
hasattr(model.model, "model")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
# prompt tuned model or fine tuned model
or (
hasattr(model.model, "embed_tokens")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
):

logging.info("Removing lm_head from checkpoint")
del model.lm_head.weight

if hasattr(model, "lm_head.weight"):
logging.warning("Failed to delete lm_head.weight from model")

logging.info("Saving checkpoint to %s", output_dir)
model.save_pretrained(checkpoint_dir)
# save tokenizer with model
tokenizer.save_pretrained(checkpoint_dir)

except Exception as e: # pylint: disable=broad-except
logging.error(traceback.format_exc())
write_termination_log(f"Exception encountered removing lm_head from model: {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# The .complete file will signal to users that we are finished copying
# files over
if os.path.exists(output_dir):
6 changes: 5 additions & 1 deletion build/utils.py
@@ -24,12 +24,16 @@
import shutil


def copy_checkpoint(source, destination):
def copy_checkpoint(source, destination, exclude_files: list[str] = None):
if not os.path.exists(destination):
os.makedirs(destination)
shutil.copystat(source, destination)
# Have a list of directory objects, now iterate over them.
if exclude_files is None:
exclude_files = []
for item in os.listdir(source):
if item in exclude_files:
continue
source_file = os.path.join(source, item)
destination_file = os.path.join(destination, item)
if os.path.isdir(source_file):
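
A hypothetical usage sketch of the extended helper: the new `exclude_files` argument lets callers skip selected files during the copy. The paths and the excluded file name below are illustrative only.

```python
from build.utils import copy_checkpoint

# Copy a checkpoint directory to its destination, leaving the optimizer state behind.
copy_checkpoint(
    source="/tmp/tuning_output/checkpoint-100",   # placeholder source checkpoint
    destination="/data/model",                    # placeholder destination
    exclude_files=["optimizer.pt"],               # example file to skip
)
```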
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -27,8 +27,8 @@ classifiers=[
]
dependencies = [
"numpy>=1.26.4,<2.0",
"accelerate==0.33",
"transformers>4.41,<5.0",
"accelerate>=0.20.3,<0.34",
"transformers>4.41,<4.45",
"torch>=2.2.0,<3.0",
"sentencepiece>=0.1.99,<0.3",
"tokenizers>=0.13.3,<1.0",
@@ -37,7 +37,6 @@ dependencies = [
"peft>=0.8.0,<0.13",
"protobuf>=5.28.0,<6.0.0",
"datasets>=2.15.0,<3.0",
"fire>=0.5.0,<1.0",
"simpleeval>=0.9.13,<1.0",
]

@@ -46,6 +45,7 @@ dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "sci
flash-attn = ["flash-attn>=2.5.3,<3.0"]
aim = ["aim>=3.19.0,<4.0"]
fms-accel = ["fms-acceleration>=0.1"]
gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]


[tool.setuptools.packages.find]