Added logprobs to vllm and nemo #264

Open
wants to merge 73 commits into base: main
Changes from 49 commits
Commits (73)
691a0c6
Allow for not passing prompt config/template
shtoshni Nov 5, 2024
c1d49d5
Allow for not passing prompt config/template
shtoshni Nov 5, 2024
0f24697
Merge branch 'main' into gen_rm
shtoshni Nov 6, 2024
805ed64
Prompt config for math generative RM
shtoshni Nov 7, 2024
2df0e4b
Merge branch 'main' into gen_rm
shtoshni Nov 11, 2024
73a77a1
added include_generation parameter to generation config
i-vainn Nov 11, 2024
52e4290
modified conf template
i-vainn Nov 12, 2024
9d7e9ae
make vllm return logprobs
i-vainn Nov 15, 2024
d03a33f
Merge branch 'main' into imoshkov/continue_generation
i-vainn Nov 22, 2024
34185ad
new prompt template
i-vainn Nov 22, 2024
8e06822
merged with main
i-vainn Nov 25, 2024
ea6691e
added logprobs param
i-vainn Nov 25, 2024
f55bfc2
added logprobs param
i-vainn Nov 25, 2024
fca3203
Merge branch 'main' into imoshkov/continue_generation
i-vainn Nov 25, 2024
0244e1b
Merge branch 'imoshkov/continue_generation' into imoshkov/debiasing_a…
i-vainn Nov 25, 2024
c25368f
added logprobs inference param
i-vainn Nov 25, 2024
12add35
attempted to add nemo logprobs
i-vainn Nov 25, 2024
1bbaf6d
removed exp files from commit
i-vainn Nov 25, 2024
d779ef1
added template
i-vainn Nov 26, 2024
326b2d5
Merging with main
shtoshni Dec 5, 2024
8f95d7a
Merging with main
shtoshni Dec 6, 2024
b27a08f
Merge branch 'gen_rm' into imoshkov/debiasing_answer
shtoshni Dec 6, 2024
c56227d
Fixing model implementation
shtoshni Dec 6, 2024
d6525a1
merged with main
i-vainn Dec 9, 2024
5d52ebf
Merge branch 'main' into imoshkov/debiasing_answer
shtoshni Dec 19, 2024
ba944fe
Merge branch 'main' into imoshkov/debiasing_answer
shtoshni Dec 19, 2024
5fb2ace
Logprobs for gen rm
shtoshni Dec 22, 2024
51147e0
Reward model score support
shtoshni Jan 13, 2025
3dd9e54
merged
i-vainn Jan 14, 2025
0bfe6f0
merged with main
i-vainn Jan 14, 2025
cc0d128
merged with main
i-vainn Feb 3, 2025
c4b5cda
fix merge
i-vainn Feb 3, 2025
8fdd18b
fix merge
i-vainn Feb 3, 2025
acae93f
fix
i-vainn Feb 3, 2025
4a0701e
enabled trt logprobs
i-vainn Feb 3, 2025
013613f
enabled trt logprobs
i-vainn Feb 3, 2025
8ab3732
modified include generation logic
i-vainn Feb 3, 2025
35b85ae
fix
i-vainn Feb 3, 2025
7b2f5af
minor fixes
i-vainn Feb 4, 2025
470b4a1
started logporbs refactoring, wip
i-vainn Feb 4, 2025
835decb
trt fix
i-vainn Feb 4, 2025
7aff1d5
trt fix
i-vainn Feb 4, 2025
dedc694
Merge branch 'imoshkov/debiasing_answer' into imoshkov/logprobs_wip
i-vainn Feb 4, 2025
470e955
completed refactoring
i-vainn Feb 4, 2025
d00ffe4
quick fix
i-vainn Feb 4, 2025
ba91774
quick fix
i-vainn Feb 4, 2025
bad1fab
quick fix
i-vainn Feb 4, 2025
e6e9a3e
merged with main
i-vainn Feb 4, 2025
e0a33ff
minor naming fixes
i-vainn Feb 5, 2025
4944d66
removed rm related things from pr
i-vainn Feb 5, 2025
0aafec4
make nemo return full tokens
i-vainn Feb 5, 2025
7d5c82b
fixed nemo generation handling
i-vainn Feb 6, 2025
0afb319
added inference tests
i-vainn Feb 6, 2025
395d561
merged with main
i-vainn Feb 6, 2025
f2aecfe
fixed tests
i-vainn Feb 7, 2025
190ffca
fixed tests
i-vainn Feb 7, 2025
4379a29
removed tmp change
i-vainn Feb 7, 2025
ed9bc8d
merged with main
i-vainn Feb 10, 2025
a944d33
fixed logprobs tests
i-vainn Feb 10, 2025
03e515b
fixed tests
i-vainn Feb 10, 2025
a31fbcf
fixed nemo num_generated_tokens issue
i-vainn Feb 11, 2025
2daad96
fixed logprobs test
i-vainn Feb 11, 2025
a0738d8
Merge branch 'main' into imoshkov/debiasing_answer
i-vainn Feb 11, 2025
5e52e19
fixed logprobs tests
i-vainn Feb 11, 2025
d74d046
Merge branch 'main' into imoshkov/debiasing_answer
i-vainn Feb 13, 2025
d1d54b7
minor fixes
i-vainn Feb 13, 2025
83fbe29
Merge branch 'main' into imoshkov/debiasing_answer
i-vainn Feb 16, 2025
ca6b979
test fixes
i-vainn Feb 16, 2025
e521b5a
test fix
i-vainn Feb 16, 2025
b3fc4c2
test fix
i-vainn Feb 17, 2025
9870151
fix
i-vainn Feb 17, 2025
2dc56f8
fix
i-vainn Feb 18, 2025
281e137
fix
i-vainn Feb 18, 2025
10 changes: 6 additions & 4 deletions nemo_skills/inference/generate.py
@@ -42,6 +42,7 @@ class InferenceConfig:
     random_seed: int = 0
     tokens_to_generate: int = 2048
     repetition_penalty: float = 1.0
+    top_logprobs: int | None = None
 
 
 @nested_dataclass(kw_only=True)
@@ -76,6 +77,7 @@ class GenerateSolutionsConfig:
     chunk_id: int | None = None  # if specified, will index the specified chunk only
 
     generation_key: str = "generation"
+    partial_generation: bool = False  # if True, model will be prompted to continue "generation" without closing assistant tag
     # if specified, we will have a loop over that key in the data file and
     # treat each element as a new turn of conversation
     # E.g. if multi_turn_key="turns" and a line in your data file has
@@ -178,7 +180,7 @@ def sync_loop(cfg, data, llm, prompt, extra_stop_phrases, extra_generate_params)
         if len(data_points) == cfg.batch_size or idx == len(data) - 1:
             if cfg.multi_turn_key is None:
                 outputs = llm.generate(
-                    prompts=[prompt.fill(dp, include_generation=cfg.include_generation) for dp in data_points],
+                    prompts=[prompt.fill(dp, include_generation=cfg.include_generation, partial_generation=cfg.partial_generation) for dp in data_points],
                     stop_phrases=combine_stop_phrases(prompt.stop_phrases, extra_stop_phrases),
                     **asdict(cfg.inference),
                     **extra_generate_params,
@@ -206,7 +208,7 @@ def sync_loop(cfg, data, llm, prompt, extra_stop_phrases, extra_generate_params)
                 # getting a new set of generations
                 turn_outputs = llm.generate(
                     prompts=[
-                        prompt.fill(turn_data_points[dp_index], multi_turn_key=cfg.multi_turn_key, include_generation=cfg.include_generation)
+                        prompt.fill(turn_data_points[dp_index], multi_turn_key=cfg.multi_turn_key, include_generation=cfg.include_generation, partial_generation=cfg.partial_generation)
                         for dp_index in dp_indices
                     ],
                     stop_phrases=combine_stop_phrases(prompt.stop_phrases, extra_stop_phrases),
@@ -276,7 +278,7 @@ def async_loop(cfg, data, llm, prompt, extra_stop_phrases, extra_generate_params)
         # Dynamic sending requests to maintain cfg.max_concurrent_requests running requests
         num_to_submit = min(cfg.max_concurrent_requests - len(in_progress), len(request_queue))
         batch_indices = [request_queue.popleft() for _ in range(num_to_submit)]
-        batch_prompts = [prompt.fill(data[idx], include_generation=cfg.include_generation) for idx in batch_indices]
+        batch_prompts = [prompt.fill(data[idx], include_generation=cfg.include_generation, partial_generation=cfg.partial_generation) for idx in batch_indices]
 
         if len(batch_prompts) > 0:
             generation_ids = llm.generate_async(
@@ -382,7 +384,7 @@ def generate(cfg: GenerateSolutionsConfig):
     LOG.info("Prompt used: %s", prompt)
 
     if cfg.multi_turn_key is None:
-        LOG.info("Example prompt:\nData dictionary: %s\nPrompt: %s", data[0], prompt.fill(data[0]))
+        LOG.info("Example prompt:\nData dictionary: %s\nPrompt: %s", data[0], prompt.fill(data[0], include_generation=cfg.include_generation, partial_generation=cfg.partial_generation))
     else:
         first_sample = deepcopy(data[0])
         first_sample[cfg.multi_turn_key] = first_sample[cfg.multi_turn_key][:1]
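The two additions in generate.py are `top_logprobs` (forwarded to the server through `**asdict(cfg.inference)`) and `partial_generation`, which keeps the assistant turn open so the model continues the existing `generation` text instead of starting a new response. Below is a minimal sketch of that prompt-filling behaviour under an assumed chat template; the `fill_prompt` helper and the tag names are hypothetical and are not the repo's actual `Prompt.fill` implementation.

```python
# Hypothetical sketch only: fill_prompt and the <user>/<assistant> tags are
# illustrative assumptions, not the templating used by nemo_skills.
def fill_prompt(
    question: str,
    generation: str = "",
    include_generation: bool = False,
    partial_generation: bool = False,
) -> str:
    text = f"<user>{question}</user>\n<assistant>"
    if include_generation:
        text += generation
        if not partial_generation:
            # closing the assistant tag makes the model start a fresh response
            text += "</assistant>"
        # with partial_generation=True the tag stays open, so the model
        # continues the existing generation text instead of restarting it
    return text


if __name__ == "__main__":
    print(fill_prompt("What is 2 + 2?", "Let's reason step by step:",
                      include_generation=True, partial_generation=True))
```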
20 changes: 15 additions & 5 deletions nemo_skills/inference/reward_model.py
@@ -98,6 +98,8 @@ def generate(cfg: RewardModelConfig):
     LOG.info("Config used: %s", cfg)
     llm = get_reward_model(model_type=cfg.reward_model_type, **cfg.server)
 
+    rm_type = cfg.server['rm_type']
+
     # making sure output dir exists
     Path(cfg.output_file).absolute().parent.mkdir(parents=True, exist_ok=True)
@@ -137,13 +139,21 @@ def generate(cfg: RewardModelConfig):
     if len(data) == 0:  # we might not have any examples if skip_filled=True
         return
 
-    LOG.info(
-        "Example prompt:\nData dictionary: %s\nPrompt: %s", data[0], prompt.fill(data[0], include_generation=True)
-    )
-
     if cfg.dry_run:
         return
 
+    if rm_type == 'disc':
+        include_generation = True
+    else:
+        # The template for GenRM already includes the generation, so we don't need to include it again
+        include_generation = False
+
+    LOG.info(
+        "Example prompt:\nData dictionary: %s\nPrompt: %s",
+        data[0],
+        prompt.fill(data[0], include_generation=include_generation),
+    )
+
     # setting buffering=1 to force to dump the output after every line, so that we can see intermediate generations
     with open(cfg.output_file, "at" if cfg.skip_filled else "wt", encoding="utf-8", buffering=1) as fout:
         data_points = []
@@ -154,7 +164,7 @@
 
             if len(data_points) == cfg.batch_size or idx == cfg.max_samples - 1:
                 outputs = llm.score(
-                    prompts=[prompt.fill(dp, include_generation=True) for dp in data_points],
+                    prompts=[prompt.fill(dp, include_generation=include_generation) for dp in data_points],
                 )
 
                 for output, original_data_point in zip(outputs, data_points):
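The behavioural change here: a discriminative reward model (`rm_type == 'disc'`) still needs the generation appended to the prompt, while the GenRM template already embeds it, so appending it again would duplicate the text. A self-contained sketch of that rule, where the `'genrm'` value is an assumed example (only `'disc'` appears in the diff):

```python
def should_include_generation(rm_type: str) -> bool:
    """Mirror of the branch added above: discriminative RMs score a
    (question, generation) pair, so the generation is appended to the prompt;
    GenRM templates already contain it."""
    return rm_type == "disc"


assert should_include_generation("disc") is True
assert should_include_generation("genrm") is False  # 'genrm' is an assumed example value
```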
6 changes: 6 additions & 0 deletions nemo_skills/inference/server/code_execution_model.py
@@ -61,9 +61,12 @@ def _generate_single(
         repetition_penalty: float,
         random_seed: int,
         stop_phrases: list[str] | None = None,
+        top_logprobs: int | None = None,
     ):
         if not isinstance(prompt, str):
             raise NotImplementedError("OpenAI API is not supported yet.")
+        if top_logprobs is not None:  # TODO: add this
+            raise NotImplementedError("top_logprobs is not supported yet.")
 
         if stop_phrases is None:
             stop_phrases = []
@@ -133,6 +136,7 @@ def generate_async(
         random_seed: int | list[int] = 0,
         stop_phrases: list[str] | list[list[str]] | None = None,
         remove_stop_phrases: bool = True,
+        top_logprobs: int | list[int] | None = None,
     ) -> list[dict]:
         """For any generation parameter you can specify a list of values that needs to match the number of prompts.
 
@@ -141,6 +145,8 @@
         # TODO: currently nemo server would get separate 1-batch requests, which is likely really inefficient
         # but the alternative is to have a fully separate implementation, which is also not nice
         # If we find ourselves needing to use nemo with code execution often, we should fix this
+        if top_logprobs is not None:  # TODO: add this
+            raise NotImplementedError("top_logprobs is not supported yet.")
         kwargs = {
             'code_begin': code_begin,
             'code_end': code_end,
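For the code-execution path the PR only adds `top_logprobs` to the signatures and rejects it with `NotImplementedError` for now. A hedged sketch of how a caller might handle that during the transition; the wrapper below is illustrative and not part of this PR, and the `llm.generate` keyword usage is assumed from the other diffs in this change.

```python
# Illustrative fallback, not part of the PR: retry without top_logprobs if the
# backing model (e.g. the code-execution wrapper above) does not support it yet.
def generate_with_optional_logprobs(llm, prompts, top_logprobs=None, **kwargs):
    if top_logprobs is None:
        return llm.generate(prompts=prompts, **kwargs)
    try:
        return llm.generate(prompts=prompts, top_logprobs=top_logprobs, **kwargs)
    except NotImplementedError:
        # top_logprobs not supported yet (see the TODOs above) -> drop it
        return llm.generate(prompts=prompts, **kwargs)
```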