chore: Merge 25.02rc0 #497

Open — wants to merge 16 commits into base: main
1 change: 1 addition & 0 deletions .github/workflows/_run_test.yml
@@ -57,6 +57,7 @@ jobs:
--env HF_HOME=/home/TestData/aligner/hf_home \
--env ALIGNER_CI_DIR=/home/TestData/aligner \
--env ALIGNER_REPO_DIR=/opt/NeMo-Aligner \
--volume /mnt/datadrive/TestData/aligner/nlp-copy:/home/TestData/aligner/nlp-copy \
--volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
--volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} \
7 changes: 6 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -133,11 +133,16 @@ jobs:
- dpo-llama3-pack
- kd-llama3
- sft-llama3
- sft-llama3-cp
- rm-llama3
- e2e-nemo2
# TODO: Uncomment after mcore upgraded to ToT to fix hang and !2532 merged to fix crash
#- dpo-mixtral-ep
- dpo-mixtral-peft-tp-sp
with:
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
TIMEOUT: 8
TIMEOUT: 10
SCRIPT: |
bash /opt/NeMo-Aligner/tests/functional/test_cases/${{ matrix.test_case }}
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -17,7 +17,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [Next Version]

## NVIDIA NeMo-Aligner 0.6.0

### New Features and Optimizations
- Added context parallel (CP) support for SFT. CP requires you to prepare your dataset using NeMo's [prepare_packed_ft_dataset.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py) script prior to training. Be sure to pass the context parallel size to this script, for example:

```
python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \
model.data.train_ds.file_names=[/path/to/training.jsonl] \
model.data.train_ds.max_seq_length=2048 \
+tokenizer_path=/path/to/tokenizer \
+output_dir=/path/to/output_folder \
+pack_sizes=[2048,4096,8192] \
model.context_parallel_size=2
```
CP can then be enabled in your training run by setting `model.context_parallel_size` in your config (see the launch sketch after this list). Refer to the [SFT documentation](https://github.com/NVIDIA/NeMo-Aligner/blob/main/docs/user-guide/sft.rst#step-1-format-the-data)
for more details on running `prepare_packed_ft_dataset.py` and on running SFT with a packed dataset.
- Sequence packing is now supported when running DPO.
- Added support for Knowledge Distillation with SFT. See the [tutorial](docs/user-guide/knowledge-distillation.rst) for details.
- Added support for Megatron Core’s distributed optimizer, which can be configured using `++model.optim.name=mcore_distributed_optim`.
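
For illustration, the context-parallel and distributed-optimizer settings above can be combined in a single SFT launch. The sketch below uses placeholder paths and values and omits dataset-specific overrides, so treat it as a minimal example rather than a verified command:

```
# Minimal sketch (placeholder paths and values): enable context parallelism and
# the Megatron Core distributed optimizer for an SFT run. Dataset-specific
# overrides are omitted; model.context_parallel_size is assumed to match the CP
# size passed to prepare_packed_ft_dataset.py when the data was packed.
python examples/nlp/gpt/train_gpt_sft.py \
    trainer.devices=8 \
    model.restore_from_path=/path/to/base_model.nemo \
    model.context_parallel_size=2 \
    ++model.optim.name=mcore_distributed_optim
```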
28 changes: 18 additions & 10 deletions Dockerfile
@@ -13,8 +13,8 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG NEMO_TAG=633cb602777bffefbe12066b0c915c87e7b469e9 # On: v2.1.0
ARG MLM_TAG=d15cec53beb283e7127b7d594e1c46b8a0719b6d # On: core_r0.10.0
ARG ALIGNER_COMMIT=main
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
@@ -107,6 +107,10 @@ RUN git clone https://github.com/NVIDIA/NeMo.git && \
pip install -e ".[nlp]" && \
cd nemo/collections/nlp/data/language_modeling/megatron && make

# TODO: Allow installing from the default branch, but introduce a build
# arg if compatibility starts breaking
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/NeMo-Run.git

# TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change
# This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is
# updated.
@@ -124,25 +128,29 @@ RUN pip uninstall -y megatron-core && \
fi && \
pip install -e .

# TODO: This is redundant since NeMo installs this as of 24.12, but keep
# it until 25.03 to give folks enough time to transition.
RUN pip install --no-cache-dir lightning

COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
RUN cd /opt/NeMo-Aligner && \
pip install --no-deps -e .

RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
# NOTE: Comment this layer out if it is not needed
# NOTE: This section exists to allow cherry-picking PRs in cases where
# we do not wish to simply update to the top-of-tree. Sometimes PRs
# cannot be cherry-picked cleanly if rebased a few times to top-of-tree
# so this logic also requires you to select a SHA (can be dangling) from
# the PR.
RUN <<"EOF" bash -exu
cd NeMo
# Ensures we don't cherry-pick "future" origin/main commits
git fetch -a
# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
# d27dd28b4186f6ecd9f46f1c5679a5eef9bad14e: fix: export weight name mapping if model is nemo model#11497
for pr_and_commit in \
"10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
"10652 60e677423667c029dd05875da72bf0719774f844" \
"10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
"11497 d27dd28b4186f6ecd9f46f1c5679a5eef9bad14e" \
; do
pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
6 changes: 4 additions & 2 deletions conftest.py
@@ -15,8 +15,8 @@
import os

import pytest
from lightning.pytorch import Trainer
from omegaconf import DictConfig
from pytorch_lightning import Trainer

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
@@ -415,6 +415,8 @@ def pytest_sessionfinish(session, exitstatus):
if torch.distributed.is_initialized():
torch.distributed.destroy_process_group()

if exitstatus == 0:
if exitstatus == 0 and (
os.environ.get("LOCAL_RANK", None) == "0" or os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", None) == "0"
):
with open(SUCCESS_FILE, "w") as f:
...
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_dpo.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.dpo.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

# dpo specific args
dpo:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
@@ -98,6 +100,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-6
weight_decay: 0.1
1 change: 1 addition & 0 deletions examples/nlp/gpt/conf/gpt_knowledge_distillation.yaml
@@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.knowledge_distillation.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

knowledge_distillation:
max_epochs: 1
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_kto.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.kto.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

# kto specific args
kto:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
@@ -98,6 +100,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-6
weight_decay: 0.1
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_actor.yaml
@@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.ppo.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

ppo:
# How many steps we train warmup the critic for (without training the policy)
@@ -21,6 +22,7 @@ trainer:
max_steps: -1 # max PPO steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# PPO args to generate the data for training
@@ -213,6 +215,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-7
weight_decay: 0.1
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_critic.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.ppo.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

ppo:
port: 5556
@@ -15,6 +16,7 @@ trainer:

# used to set the learning rate scheduler
max_steps: 10000
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# a PyTriton parameter to specify
@@ -109,6 +111,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-6
weight_decay: 0.1
1 change: 1 addition & 0 deletions examples/nlp/gpt/conf/gpt_reinforce_actor.yaml
@@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.reinforce.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

reinforce:

5 changes: 4 additions & 1 deletion examples/nlp/gpt/conf/gpt_rs_actor.yaml
@@ -7,12 +7,14 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.rs.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

rs:
max_epochs: 1
max_steps: -1 # max rs steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# pick up from the model
@@ -147,6 +149,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-7
weight_decay: 0.1
@@ -177,4 +180,4 @@ model:
# define fields from the base model's config that should be ignored when merging with this config.
overwrite_base_config:
data:
data_prefix: True
data_prefix: True
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_sft.yaml
@@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.sft.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

sft:
max_epochs: 1
@@ -15,6 +16,7 @@ trainer:
limit_train_batches: 1.0

limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# can be used to register any custom metrics that require token-by-token generation
@@ -57,6 +59,7 @@ model:
seed: 1234
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism
context_parallel_size: 1 # parallelism along sequence length
restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/gpt_spin.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16-mixed
gradient_clip_val: ${trainer.spin.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

# spin specific args
spin:
@@ -18,6 +19,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
@@ -93,6 +95,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-6
weight_decay: 0.1
3 changes: 3 additions & 0 deletions examples/nlp/gpt/conf/training_rm.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: ${trainer.rm.gradient_clip_val} # No need to change. Megatron Core optimizer uses this value

# rm specific args
rm:
@@ -20,6 +21,7 @@ trainer:
# set to float for a percentage
# of the validation dataset
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
@@ -91,6 +93,7 @@ model:
name: distributed_fused_adam
bucket_cap_mb: 200
overlap_grad_sync: False
overlap_param_sync: False
contiguous_grad_buffer: True
lr: 9e-6
weight_decay: 0.1
2 changes: 1 addition & 1 deletion examples/nlp/gpt/serve_reward_model.py
@@ -13,7 +13,7 @@
# limitations under the License.

import torch
from pytorch_lightning.trainer.trainer import Trainer
from lightning.pytorch.trainer.trainer import Trainer

from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.core.config import hydra_runner
2 changes: 2 additions & 0 deletions examples/nlp/gpt/train_gpt_sft.py
@@ -105,6 +105,7 @@ def main(cfg) -> None:
answer_only_loss=True,
is_chat=cfg.model.data.chat,
special_tokens=cfg.model.data.chat_prompt_tokens,
model_cfg=cfg.model,
)
if cfg.model.data.get("sample", False):
num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size
@@ -117,6 +118,7 @@
answer_only_loss=True,
is_chat=cfg.model.data.chat,
special_tokens=cfg.model.data.chat_prompt_tokens,
model_cfg=cfg.model,
)

train_dataloader = build_dataloader(
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/critic_server_trainer.py
@@ -322,7 +322,7 @@ def run_training(self, tokens=None, returns=None, prev_values=None, mask=None):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/dpo.py
@@ -220,7 +220,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/ppo.py
@@ -440,7 +440,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None: