Math RL data preparation #368

Merged 27 commits on Feb 10, 2025
docs/openmathinstruct2/dataset.md (1 addition, 1 deletion)

@@ -254,7 +254,7 @@ To avoid the models from generating extremely short solutions, we remove solutio

```bash
ns run_cmd --cluster=slurm \
-python -m nemo_skills.training.prepare_sft_data \
+python -m nemo_skills.training.prepare_data \
++prompt_template=llama3-instruct \
++prompt_config=generic/math \
++input_files=\'/workspace/solution-augmentation/**/output-rs*.jsonl,/workspace/new-problems-solution-augmentation/**/output-rs*.jsonl\' \
docs/openmathinstruct2/training.md (1 addition, 1 deletion)

@@ -35,7 +35,7 @@ Convert the data into the SFT format that NeMo-Aligner understands.

```bash
ns run_cmd --cluster=local \
-python -m nemo_skills.training.prepare_sft_data \
+python -m nemo_skills.training.prepare_data \
++prompt_template=llama3-instruct \
++prompt_config=generic/math \
++preprocessed_dataset_files=/workspace/openmathinstruct2.jsonl \
docs/pipelines/decontamination.md (1 addition, 1 deletion)

@@ -74,7 +74,7 @@ If you want instead to clean your training data from contaminated examples all t
you need to swap values for the `retrieve_from` and `compare_to` arguments in the `retrieve_similar` step,
since we now want to check each training set example and find the closest test set problems.

-After you get `/workspace/math-contamination-results.jsonl`, you can pass it into [prepare_sft_data command](training.md#preparing-the-data)
+After you get `/workspace/math-contamination-results.jsonl`, you can pass it into [prepare_data command](training.md#preparing-the-data)
with the `++contamination_file=...` option.

See a more detailed example in [OpenMathInstruct-2 dataset construction pipeline](../openmathinstruct2/dataset.md#decontamination).
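For reference, contamination-based filtering of the kind the `RemoveContaminated` processor performs amounts to dropping training entries whose problems appear in the contamination results. The sketch below is illustrative only and assumes the results file marks each record with `problem` and `contaminated` fields; the actual schema and the processor's implementation are not shown in this PR.

```python
import json


def drop_contaminated(training_entries, contamination_file):
    """Drop training entries whose problem was flagged as contaminated.

    The field names ('problem', 'contaminated') are assumptions for illustration;
    check the real contamination results file for the exact schema.
    """
    contaminated = set()
    with open(contamination_file) as f:
        for line in f:
            record = json.loads(line)
            if record.get("contaminated"):
                contaminated.add(record["problem"])
    return [entry for entry in training_entries if entry["problem"] not in contaminated]


# Usage sketch:
# clean = drop_contaminated(entries, "/workspace/math-contamination-results.jsonl")
```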
docs/pipelines/training.md (2 additions, 2 deletions)

@@ -12,7 +12,7 @@
Before running the training, we need to prepare the data in the right format. Here is an example command

```bash
-python -m nemo_skills.training.prepare_sft_data \
+python -m nemo_skills.training.prepare_data \
    ++input_files="<path to the generated synthetic data>/output-rs*.jsonl" \
++output_path=sft-data.jsonl \
++prompt_config=generic/math \
@@ -28,7 +28,7 @@ If you want to run that command inside container or on cluster, add `ns run_cmd

You need to pass in the config/template files so that we can format the data accordingly. There are many more parameters
that the data preparation script supports, which you can see
-[here](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/data_preparation_utils/prepare_sft_data.yaml).
+[here](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/data_preparation_utils/math_sft.yaml).
We are using the [SDP library](https://github.com/NVIDIA/NeMo-speech-data-processor) for preparing the data, so it's
a good idea to check their documentation to understand how this config is structured.

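To make the SDP-style structure concrete: each entry under `processors` in these configs points (via `_target_`) to a class that handles dataset entries one by one. Below is a minimal, self-contained mimic of that pattern; the real `BaseFilter`/`DataEntry` classes live in nemo_skills and SDP and differ in detail, so treat this as an illustration only.

```python
from dataclasses import dataclass, field
from typing import List, Optional


# Mimic of SDP's DataEntry: data=None signals that the entry should be dropped.
@dataclass
class DataEntry:
    data: Optional[dict]
    metrics: dict = field(default_factory=dict)


class DropShortSolutions:
    """Hypothetical filter in the style of the processors these configs reference."""

    def __init__(self, solution_key: str = "generation", min_length: int = 10):
        self.solution_key = solution_key
        self.min_length = min_length

    def process_dataset_entry(self, data_entry: dict) -> List[DataEntry]:
        # Drop entries whose solution text is shorter than min_length characters.
        if len(data_entry.get(self.solution_key, "")) < self.min_length:
            return [DataEntry(data=None, metrics=dict(num_removed=1))]
        return [DataEntry(data=data_entry, metrics=dict(num_removed=0))]
```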
nemo_skills/inference/llm_math_judge.py (1 addition, 12 deletions)

@@ -27,7 +27,7 @@
from nemo_skills.inference.generate import InferenceConfig
from nemo_skills.inference.server.code_execution_model import get_code_execution_model, get_model, server_params
from nemo_skills.prompt.utils import get_prompt
-from nemo_skills.utils import get_help_message, nested_dataclass, setup_logging, unroll_files
+from nemo_skills.utils import get_help_message, nested_dataclass, prefill_judgement, setup_logging

LOG = logging.getLogger(__file__)

@@ -97,17 +97,6 @@ def __post_init__(self):
cs.store(name="base_llm_math_judge_config", node=LlmMathJudgeConfig)


-def prefill_judgement(data_point: dict) -> str | None:
-    """Will automatically fill judgement if there is an exact match or the answer is None."""
-    if data_point['predicted_answer'] is None:
-        return "Reasoning: No answer was provided.\nJudgement: No"
-
-    if str(data_point['predicted_answer']).strip() == str(data_point['expected_answer']).strip():
-        return "Reasoning: The two answers are identical.\nJudgement: Yes"
-
-    return None


@hydra.main(version_base=None, config_name='base_llm_math_judge_config', config_path='.')
def llm_math_judge(cfg: LlmMathJudgeConfig):
    cfg = LlmMathJudgeConfig(_init_nested=True, **cfg)
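With `prefill_judgement` moved into `nemo_skills.utils` (see the import change above), a call site can short-circuit trivial cases before querying the LLM judge. The loop below only sketches that idea; the actual control flow inside `llm_math_judge` is not shown in this diff.

```python
from nemo_skills.utils import prefill_judgement


def judge_all(data_points, llm_judge_fn):
    """Illustrative only: prefill trivial judgements and send the rest to the LLM judge."""
    judgements = []
    for data_point in data_points:
        prefilled = prefill_judgement(data_point)
        if prefilled is not None:
            # Exact match or missing predicted answer: no LLM call needed.
            judgements.append(prefilled)
        else:
            judgements.append(llm_judge_fn(data_point))
    return judgements
```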
nemo_skills/training/data_preparation_utils/filters.py (20 additions)

@@ -99,6 +99,26 @@ def process_dataset_entry(self, data_entry) -> List:
        return [DataEntry(data=data_entry, metrics=dict(num_removed=0))]


+class DropIfEqual(BaseFilter):
+    """Drops a data entry if the value at `key` matches any of the provided values."""
+
+    def __init__(
+        self,
+        values,
+        key: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.values = values
+        self.key = key
+
+    def process_dataset_entry(self, data_entry) -> List:
+        for value in self.values:
+            if data_entry[self.key] == value:
+                return [DataEntry(data=None, metrics=dict(num_removed=1))]
+        return [DataEntry(data=data_entry, metrics=dict(num_removed=0))]


class DropMultiBoxed(BaseFilter):
    def __init__(self, solution_key: str = "generation", **kwargs):
        super().__init__(**kwargs)
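A quick usage sketch for the new `DropIfEqual` filter, mirroring the `drop_none_answers` entry in math_rl.yaml below. Constructing the filter directly like this is an assumption for illustration; in practice it is instantiated from the config, and `BaseFilter` may expect additional constructor kwargs not shown here.

```python
# Hypothetical direct construction; normally the config-driven pipeline instantiates this.
drop_empty = DropIfEqual(values=[None, ""], key="expected_answer")

kept = drop_empty.process_dataset_entry({"problem": "2+2=?", "expected_answer": "4"})
dropped = drop_empty.process_dataset_entry({"problem": "2+2=?", "expected_answer": None})

assert kept[0].data is not None   # entry with a real answer is kept
assert dropped[0].data is None    # entry with no expected answer is dropped
```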
nemo_skills/training/data_preparation_utils/math_rl.yaml (90 additions, new file)

@@ -0,0 +1,90 @@
processors_to_run: all

Collaborator comment (on `processors_to_run: all`): To avoid clutter, we can move all the config files into a config folder.

output_path: ???
prompt_config: null
prompt_template: null

preprocessed_dataset_files: null # can specify multiple patterns separated by space
input_files: null # can specify datasets from HF instead of prediction_jsonl_files

exclude_optional_keys: true # exclude keys other than input, output and expected_answer from the final manifest
metadata: null # can provide additional metadata to store (e.g. dataset or generation_type)
skip_first: 0 # useful for skipping validation set from train_full generation (it's always first)

random_seed: 42
do_shuffle: true
num_output_samples: null

# Params for length-based filtering
# Unit of length - Tokens
hf_model_name: null

# Problem params
min_problem_length: 0
max_problem_length: 1024

input_key: problem

contamination_file: null

majority_filter:
  # minimum number of majority votes to use the answer
  min_majority_votes: 0
  # minimum portion of majority votes to use the answer
  min_majority_percentage: 0.0

filters:
  remove_contaminated: true
  remove_len_outlier_problems: true
  majority_filter: false
  drop_none_answers: true

deduplicate: true

processors:
  - _target_: nemo_skills.training.data_preparation_utils.preprocessing.ReadData
    preprocessed_dataset_files: ${preprocessed_dataset_files}
    input_files: ${input_files}
    input_key: ${input_key}
    skip_first: ${skip_first}
    deduplicate: ${deduplicate}
    keys_to_keep: # input/output/is_correct/judgement are automatically added
      - "expected_answer"
      - "total_votes"
      - "majority_votes"

  # this will optimize processors inside to avoid serializing data to disk
  - _target_: nemo_skills.training.data_preparation_utils.merge_processor.MergeProcessor

Collaborator comment (on the MergeProcessor entry): This might be a good addition for math_sft.yaml and code_sft.yaml as well. We can do it later, but it's good to know of this functionality.

Author reply: The merge processor? It's already in math_sft, and code_sft doesn't have much filtering to benefit from it.

    _recursive_: False
    processor_configs:
      - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfEqual
        should_run: ${filters.drop_none_answers}
        key: expected_answer
        values: [null, ""]

      - _target_: nemo_skills.training.data_preparation_utils.filters.RemoveContaminated
        should_run: ${filters.remove_contaminated}
        contamination_file: ${contamination_file}

      - _target_: nemo_skills.training.data_preparation_utils.filters.MajorityFilter
        should_run: ${filters.majority_filter}
        min_majority_votes: ${majority_filter.min_majority_votes}
        min_majority_percentage: ${majority_filter.min_majority_percentage}

      - _target_: nemo_skills.training.data_preparation_utils.filters.RemoveLenOutliers
        should_run: ${filters.remove_len_outlier_problems}
        property_key: ${input_key}
        min_length: ${min_problem_length}
        max_length: ${max_problem_length}
        hf_model_name: ${hf_model_name}

  - _target_: nemo_skills.training.data_preparation_utils.preprocessing.WriteFinalRLManifest
    output_manifest_file: ${output_path}
    prompt_config: ${prompt_config}
    prompt_template: ${prompt_template}
    input_key: ${input_key}
    metadata: ${metadata}
    exclude_optional_keys: ${exclude_optional_keys}
    random_seed: ${random_seed}
    do_shuffle: ${do_shuffle}
    num_output_samples: ${num_output_samples}
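For readers unfamiliar with this config style: the `filters.*` switches at the top gate individual processors through the `should_run: ${filters...}` interpolations, and each `_target_` string names the class to instantiate. The snippet below only illustrates how such interpolations resolve with OmegaConf on a trimmed-down copy of this config; the real gating and instantiation happen inside the prepare_data pipeline, which is not part of this diff.

```python
from omegaconf import OmegaConf

# Trimmed-down, illustrative copy of the config above.
yaml_snippet = """
filters:
  majority_filter: false
  drop_none_answers: true

processor_configs:
  - _target_: nemo_skills.training.data_preparation_utils.filters.DropIfEqual
    should_run: ${filters.drop_none_answers}
    key: expected_answer
    values: [null, ""]
  - _target_: nemo_skills.training.data_preparation_utils.filters.MajorityFilter
    should_run: ${filters.majority_filter}
"""

cfg = OmegaConf.create(yaml_snippet)
# Only processors whose should_run interpolation resolves to true stay enabled.
enabled = [p["_target_"] for p in cfg.processor_configs if p.should_run]
print(enabled)
# ['nemo_skills.training.data_preparation_utils.filters.DropIfEqual']
```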