
Commit

Merge pull request #357 from foundation-model-stack/v2.0.0-rc2
release: merge set of changes for v2.0.0
Abhishek-TAMU authored Sep 30, 2024
2 parents 16543ee + a03e58f commit 3b150ab
Showing 45 changed files with 99,317 additions and 348 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/labelpr.yaml
@@ -0,0 +1,35 @@
name: Label PRs

on:
  pull_request_target:
    types: [opened, edited, synchronize, reopened]

jobs:
  label_pr:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v3
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const pr_welcome_msg = `Thanks for making a pull request! 😃\nOne of the maintainers will review and advise on the next steps.`;
            // https://github.com/commitizen/conventional-commit-types
            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert'];
            if(context.payload.pull_request.comments === 0) {
              await github.issues.createComment({ ...context.repo, issue_number: context.payload.number, body: pr_welcome_msg});
            }
            const title = context.payload.pull_request.title;
            const results = /^(\w+)(\(\w+\))?!?:/.exec(title);
            if (results === null) return core.setFailed(`The title does not follow conventional commits spec: https://www.conventionalcommits.org/en/v1.0.0/#summary Title: ${title}`);
            const pr_type = results[1];
            core.info(`pr_type: ${pr_type}`);
            if (!valid_pr_types.includes(pr_type)) return core.setFailed(`Unknown pull request type: ${pr_type}`);
            const labels = context.payload.pull_request.labels;
            const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
            new_labels.push({name: pr_type});
            await github.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
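
For context, here is a small Python sketch (not part of this commit) of the same conventional-commits check the workflow applies to PR titles; it can be handy for validating a title locally before pushing:

```python
import re
from typing import Optional

# Same allowed types and title pattern as the workflow above.
VALID_PR_TYPES = {"feat", "fix", "docs", "style", "refactor", "perf",
                  "test", "build", "ci", "chore", "revert"}
TITLE_RE = re.compile(r"^(\w+)(\(\w+\))?!?:")


def pr_type_from_title(title: str) -> Optional[str]:
    """Return the conventional-commit type of a PR title, or None if it is invalid."""
    match = TITLE_RE.match(title)
    if match is None or match.group(1) not in VALID_PR_TYPES:
        return None
    return match.group(1)


print(pr_type_from_title("feat(trainer): add padding-free attention"))  # -> "feat"
print(pr_type_from_title("update readme"))                              # -> None
```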
2 changes: 1 addition & 1 deletion .pylintrc
@@ -333,7 +333,7 @@ indent-string=' '
max-line-length=100

# Maximum number of lines in a module.
max-module-lines=1100
max-module-lines=1200

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
198 changes: 180 additions & 18 deletions README.md

Large diffs are not rendered by default.

44 changes: 30 additions & 14 deletions build/Dockerfile
@@ -19,17 +19,21 @@ ARG USER=tuning
ARG USER_UID=1000
ARG PYTHON_VERSION=3.11
ARG WHEEL_VERSION=""
## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_FMS_ACCELERATION=true

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} as base
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base

ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# Note this works for 3.9, 3.11, 3.12
RUN dnf remove -y --disableplugin=subscription-manager \
subscription-manager \
&& dnf install -y python${PYTHON_VERSION} procps \
&& dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
&& ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
&& python -m ensurepip --upgrade \
&& python -m pip install --upgrade pip \
@@ -44,14 +48,14 @@ RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
chmod g+rx /home/${USER}

## Used as base of the Release stage to remove unrelated packages and CVEs
FROM base as release-base
FROM base AS release-base

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts


## CUDA Base ###################################################################
FROM base as cuda-base
FROM base AS cuda-base

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
@@ -75,7 +79,7 @@ ENV CUDA_HOME="/usr/local/cuda" \
LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## CUDA Development ############################################################
FROM cuda-base as cuda-devel
FROM cuda-base AS cuda-devel

# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
@@ -99,14 +103,13 @@ RUN dnf config-manager \

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

FROM cuda-devel as python-installations
FROM cuda-devel AS python-installations

ARG WHEEL_VERSION
ARG USER
ARG USER_UID

## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_FMS_ACCELERATION
ARG ENABLE_AIM

RUN dnf install -y git && \
# perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
@@ -132,19 +135,32 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user wheel && \
python -m pip install --user "$(head bdist_name)" && \
python -m pip install --user "$(head bdist_name)[flash-attn]" && \
if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[flash-attn]"

# fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
# fms_acceleration_foak = Fused LoRA and triton kernels
# fms_acceleration_aadp = Padding-Free Flash Attention Computation
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[fms-accel]"; \
python -m fms_acceleration.cli install fms_acceleration_peft; \
python -m fms_acceleration.cli install fms_acceleration_foak; \
python -m fms_acceleration.cli install fms_acceleration_aadp; \
fi

RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[aim]"; \
fi && \
fi

# Clean up the wheel module. It's only needed by flash-attn install
python -m pip uninstall wheel build -y && \
RUN python -m pip uninstall wheel build -y && \
# Cleanup the bdist whl file
rm $(head bdist_name) /tmp/bdist_name

## Final image ################################################
FROM release-base as release
FROM release-base AS release
ARG USER
ARG PYTHON_VERSION
ARG ENABLE_AIM

RUN mkdir -p /licenses
COPY LICENSE /licenses/
6 changes: 4 additions & 2 deletions build/README.md
@@ -38,14 +38,16 @@ For example, the below config is used for running with two GPUs and FSDP for fin
"per_device_train_batch_size": 4,
"learning_rate": 1e-5,
"response_template": "\n### Label:",
"dataset_text_field": "output"
"dataset_text_field": "output",
"lora_post_process_for_vllm": true
}
```

Users should always set `num_processes` to be explicit about the number of processes to run tuning on. When `num_processes` is greater than 1, the [FSDP config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/fixtures/accelerate_fsdp_defaults.yaml) is used by default. Thus in the above example, you don't need to pass in the FSDP flags since they match the ones used in the default FSDP config. You can also set your own default values by specifying your own config file using key `config_file`. Any of these values in configs can be overwritten by passing in flags via `accelerate_launch_args` in the JSON config.
`num_processes` defaults to the number of GPUs allocated for tuning, unless the user sets `SET_NUM_PROCESSES_TO_NUM_GPUS` to `False`. When `num_processes` is greater than 1, the [FSDP config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/fixtures/accelerate_fsdp_defaults.yaml) is used by default. Thus in the above example, you don't need to pass in the FSDP flags since they match the ones used in the default FSDP config. You can also set your own default values by specifying your own config file using the key `config_file`. Any of these values can be overridden by passing flags via `accelerate_launch_args` in the JSON config.

Note that `num_processes`, which is the total number of processes to be launched in parallel, should match the number of GPUs to run on. The number of GPUs used can also be set via the environment variable `CUDA_VISIBLE_DEVICES`. If `num_processes=1`, the script will assume a single GPU.

If tuning LoRA adapters for later inference on vLLM, set `lora_post_process_for_vllm` to `true` so the adapters are post-processed for vLLM compatibility: vLLM requires any new token embedding weights added during tuning to be moved into a separate `new_embeddings.safetensors` file.
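
For illustration, here is a minimal sketch of invoking that post-processing step by hand on a tuned LoRA checkpoint. The launcher in `build/accelerate_launch.py` does this automatically when `lora_post_process_for_vllm` is set; the paths below are placeholders.

```python
import json
import os

from tuning.utils.merge_model_utils import post_process_vLLM_adapters_new_tokens

output_dir = "/path/to/output_dir"  # placeholder: where tuning wrote its checkpoints
checkpoint_dir = os.path.join(output_dir, "checkpoint-100")  # placeholder checkpoint

# added_tokens_info.json is written during tuning and records how many tokens
# were added to the tokenizer (and therefore to the embedding matrix).
with open(os.path.join(output_dir, "added_tokens_info.json"), encoding="utf-8") as f:
    num_added_tokens = json.load(f)["num_new_tokens"]

# Rewrite the adapter in place: the embedding rows for the added tokens are moved
# out of the adapter weights into new_embeddings.safetensors, the layout vLLM expects.
post_process_vLLM_adapters_new_tokens(checkpoint_dir, checkpoint_dir, num_added_tokens)
```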

## Building the Image

137 changes: 50 additions & 87 deletions build/accelerate_launch.py
@@ -23,38 +23,29 @@
import subprocess
import sys
import traceback
from pathlib import Path
import json
from pathlib import Path

# Third Party
from accelerate.commands.launch import launch_command
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from torch import bfloat16

# Local
from build.utils import (
process_accelerate_launch_args,
get_highest_checkpoint,
)
from tuning.utils.merge_model_utils import (
post_process_vLLM_adapters_new_tokens,
)
from tuning.utils.config_utils import get_json_config
from tuning.utils.error_logging import (
write_termination_log,
USER_ERROR_EXIT_CODE,
INTERNAL_ERROR_EXIT_CODE,
)
from tuning.data import tokenizer_data_utils

ERROR_LOG = "/dev/termination-log"


def get_base_model_from_adapter_config(adapter_config):
"""Given path to adapter_config.json file, returns the base model name"""
with open(adapter_config, "r", encoding="utf-8") as config_file:
adapter_config = json.load(config_file)
return adapter_config.get("base_model_name_or_path")


def main():
if not os.getenv("TERMINATION_LOG_FILE"):
os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG
@@ -107,6 +98,8 @@ def main():
#
##########
output_dir = job_config.get("output_dir")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
# checkpoints outputted to tempdir, only final checkpoint copied to output dir
launch_command(args)
@@ -128,85 +121,55 @@
write_termination_log(f"Unhandled exception during training. {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# remove lm_head from granite with llama arch models
try:
checkpoint_dir = job_config.get("save_model_dir")
if not checkpoint_dir:
checkpoint_dir = os.path.join(
output_dir, get_highest_checkpoint(output_dir)
)

use_flash_attn = job_config.get("use_flash_attn", True)
adapter_config_path = os.path.join(checkpoint_dir, "adapter_config.json")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
peft_method = job_config.get("peft_method")

if job_config.get("lora_post_process_for_vllm") and peft_method == "lora":
save_model_dir = job_config.get("save_model_dir")
if save_model_dir:
if os.path.exists(os.path.join(save_model_dir, "added_tokens_info.json")):
with open(
os.path.join(save_model_dir, "added_tokens_info.json"),
encoding="utf-8",
) as json_data:
added_tokens_info = json.load(json_data)
num_added_tokens = added_tokens_info["num_new_tokens"]
else:
logging.warning(
"Failed to post-process: file added_tokens_info.json not in path %s",
save_model_dir,
)

if os.path.exists(adapter_config_path):
base_model_path = get_base_model_from_adapter_config(adapter_config_path)
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)
if os.path.exists(
os.path.join(save_model_dir, "adapter_model.safetensors")
):
post_process_vLLM_adapters_new_tokens(
save_model_dir, save_model_dir, num_added_tokens
)

# since the peft library (PEFTModelForCausalLM) does not handle cases
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
base_model,
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)
if (
os.path.exists(os.path.join(output_dir, "added_tokens_info.json"))
and job_config.get("save_strategy") != "no"
):
with open(
os.path.join(output_dir, "added_tokens_info.json"), encoding="utf-8"
) as json_data:
added_tokens_info = json.load(json_data)
num_added_tokens = added_tokens_info["num_new_tokens"]
# if multiple checkpoints in directory, process each checkpoint
for _, dirs, _ in os.walk(output_dir, topdown=False):
for name in dirs:
if "checkpoint-" in name.lower():
post_process_vLLM_adapters_new_tokens(
os.path.join(output_dir, name),
os.path.join(output_dir, name),
num_added_tokens,
)
else:
model = AutoModelForCausalLM.from_pretrained(
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
logging.warning(
"Failed to post-process: file added_tokens_info.json not in path %s",
save_model_dir,
)

model_arch = model.config.model_type
# check that it is a granite model with llama architecture with tied weights
# ie. lm_head is duplicate of embeddings

# a fine tuned model will have params_dict.get("model.embed_tokens.weight")
# a prompt adapter has params_dict.get("base_model.model.embed_tokens.weight")
# a lora adapter has params_dict.get("base_model.model.model.embed_tokens.weight")
if model_arch == "llama" and hasattr(model, "lm_head"):
if (
# lora tuned model has an addt model layer
(
hasattr(model.model, "model")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
# prompt tuned model or fine tuned model
or (
hasattr(model.model, "embed_tokens")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
):

logging.info("Removing lm_head from checkpoint")
del model.lm_head.weight

if hasattr(model, "lm_head.weight"):
logging.warning("Failed to delete lm_head.weight from model")

logging.info("Saving checkpoint to %s", output_dir)
model.save_pretrained(checkpoint_dir)
# save tokenizer with model
tokenizer.save_pretrained(checkpoint_dir)

except Exception as e: # pylint: disable=broad-except
logging.error(traceback.format_exc())
write_termination_log(f"Exception encountered removing lm_head from model: {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# The .complete file will signal to users that we are finished copying
# files over
if os.path.exists(output_dir):
6 changes: 5 additions & 1 deletion build/utils.py
@@ -24,12 +24,16 @@
import shutil


def copy_checkpoint(source, destination):
def copy_checkpoint(source, destination, exclude_files: list[str] = None):
if not os.path.exists(destination):
os.makedirs(destination)
shutil.copystat(source, destination)
# Have a list of directory objects, now iterate over them.
if exclude_files is None:
exclude_files = []
for item in os.listdir(source):
if item in exclude_files:
continue
source_file = os.path.join(source, item)
destination_file = os.path.join(destination, item)
if os.path.isdir(source_file):
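
A hypothetical usage sketch of the extended helper: the new `exclude_files` argument lets callers skip selected files during the copy. The paths and the excluded file name below are illustrative only.

```python
from build.utils import copy_checkpoint

# Copy a checkpoint directory to its destination, leaving the optimizer state behind.
copy_checkpoint(
    source="/tmp/tuning_output/checkpoint-100",   # placeholder source checkpoint
    destination="/data/model",                    # placeholder destination
    exclude_files=["optimizer.pt"],               # example file to skip
)
```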
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -27,8 +27,8 @@ classifiers=[
]
dependencies = [
"numpy>=1.26.4,<2.0",
"accelerate==0.33",
"transformers>4.41,<5.0",
"accelerate>=0.20.3,<0.34",
"transformers>4.41,<4.45",
"torch>=2.2.0,<3.0",
"sentencepiece>=0.1.99,<0.3",
"tokenizers>=0.13.3,<1.0",
@@ -37,7 +37,6 @@ dependencies = [
"peft>=0.8.0,<0.13",
"protobuf>=5.28.0,<6.0.0",
"datasets>=2.15.0,<3.0",
"fire>=0.5.0,<1.0",
"simpleeval>=0.9.13,<1.0",
]

@@ -46,6 +45,7 @@ dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "sci
flash-attn = ["flash-attn>=2.5.3,<3.0"]
aim = ["aim>=3.19.0,<4.0"]
fms-accel = ["fms-acceleration>=0.1"]
gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]


[tool.setuptools.packages.find]