
Added logprobs to vllm and nemo #264

Status: Open. Wants to merge 73 commits into base branch `main`.

Commits (73)
691a0c6  Allow for not passing prompt config/template  (shtoshni, Nov 5, 2024)
c1d49d5  Allow for not passing prompt config/template  (shtoshni, Nov 5, 2024)
0f24697  Merge branch 'main' into gen_rm  (shtoshni, Nov 6, 2024)
805ed64  Prompt config for math generative RM  (shtoshni, Nov 7, 2024)
2df0e4b  Merge branch 'main' into gen_rm  (shtoshni, Nov 11, 2024)
73a77a1  added include_generation parameter to generation config  (i-vainn, Nov 11, 2024)
52e4290  modified conf template  (i-vainn, Nov 12, 2024)
9d7e9ae  make vllm return logprobs  (i-vainn, Nov 15, 2024)
d03a33f  Merge branch 'main' into imoshkov/continue_generation  (i-vainn, Nov 22, 2024)
34185ad  new prompt template  (i-vainn, Nov 22, 2024)
8e06822  merged with main  (i-vainn, Nov 25, 2024)
ea6691e  added logprobs param  (i-vainn, Nov 25, 2024)
f55bfc2  added logprobs param  (i-vainn, Nov 25, 2024)
fca3203  Merge branch 'main' into imoshkov/continue_generation  (i-vainn, Nov 25, 2024)
0244e1b  Merge branch 'imoshkov/continue_generation' into imoshkov/debiasing_a…  (i-vainn, Nov 25, 2024)
c25368f  added logprobs inference param  (i-vainn, Nov 25, 2024)
12add35  attempted to add nemo logprobs  (i-vainn, Nov 25, 2024)
1bbaf6d  removed exp files from commit  (i-vainn, Nov 25, 2024)
d779ef1  added template  (i-vainn, Nov 26, 2024)
326b2d5  Merging with main  (shtoshni, Dec 5, 2024)
8f95d7a  Merging with main  (shtoshni, Dec 6, 2024)
b27a08f  Merge branch 'gen_rm' into imoshkov/debiasing_answer  (shtoshni, Dec 6, 2024)
c56227d  Fixing model implementation  (shtoshni, Dec 6, 2024)
d6525a1  merged with main  (i-vainn, Dec 9, 2024)
5d52ebf  Merge branch 'main' into imoshkov/debiasing_answer  (shtoshni, Dec 19, 2024)
ba944fe  Merge branch 'main' into imoshkov/debiasing_answer  (shtoshni, Dec 19, 2024)
5fb2ace  Logprobs for gen rm  (shtoshni, Dec 22, 2024)
51147e0  Reward model score support  (shtoshni, Jan 13, 2025)
3dd9e54  merged  (i-vainn, Jan 14, 2025)
0bfe6f0  merged with main  (i-vainn, Jan 14, 2025)
cc0d128  merged with main  (i-vainn, Feb 3, 2025)
c4b5cda  fix merge  (i-vainn, Feb 3, 2025)
8fdd18b  fix merge  (i-vainn, Feb 3, 2025)
acae93f  fix  (i-vainn, Feb 3, 2025)
4a0701e  enabled trt logprobs  (i-vainn, Feb 3, 2025)
013613f  enabled trt logprobs  (i-vainn, Feb 3, 2025)
8ab3732  modified include generation logic  (i-vainn, Feb 3, 2025)
35b85ae  fix  (i-vainn, Feb 3, 2025)
7b2f5af  minor fixes  (i-vainn, Feb 4, 2025)
470b4a1  started logporbs refactoring, wip  (i-vainn, Feb 4, 2025)
835decb  trt fix  (i-vainn, Feb 4, 2025)
7aff1d5  trt fix  (i-vainn, Feb 4, 2025)
dedc694  Merge branch 'imoshkov/debiasing_answer' into imoshkov/logprobs_wip  (i-vainn, Feb 4, 2025)
470e955  completed refactoring  (i-vainn, Feb 4, 2025)
d00ffe4  quick fix  (i-vainn, Feb 4, 2025)
ba91774  quick fix  (i-vainn, Feb 4, 2025)
bad1fab  quick fix  (i-vainn, Feb 4, 2025)
e6e9a3e  merged with main  (i-vainn, Feb 4, 2025)
e0a33ff  minor naming fixes  (i-vainn, Feb 5, 2025)
4944d66  removed rm related things from pr  (i-vainn, Feb 5, 2025)
0aafec4  make nemo return full tokens  (i-vainn, Feb 5, 2025)
7d5c82b  fixed nemo generation handling  (i-vainn, Feb 6, 2025)
0afb319  added inference tests  (i-vainn, Feb 6, 2025)
395d561  merged with main  (i-vainn, Feb 6, 2025)
f2aecfe  fixed tests  (i-vainn, Feb 7, 2025)
190ffca  fixed tests  (i-vainn, Feb 7, 2025)
4379a29  removed tmp change  (i-vainn, Feb 7, 2025)
ed9bc8d  merged with main  (i-vainn, Feb 10, 2025)
a944d33  fixed logprobs tests  (i-vainn, Feb 10, 2025)
03e515b  fixed tests  (i-vainn, Feb 10, 2025)
a31fbcf  fixed nemo num_generated_tokens issue  (i-vainn, Feb 11, 2025)
2daad96  fixed logprobs test  (i-vainn, Feb 11, 2025)
a0738d8  Merge branch 'main' into imoshkov/debiasing_answer  (i-vainn, Feb 11, 2025)
5e52e19  fixed logprobs tests  (i-vainn, Feb 11, 2025)
d74d046  Merge branch 'main' into imoshkov/debiasing_answer  (i-vainn, Feb 13, 2025)
d1d54b7  minor fixes  (i-vainn, Feb 13, 2025)
83fbe29  Merge branch 'main' into imoshkov/debiasing_answer  (i-vainn, Feb 16, 2025)
ca6b979  test fixes  (i-vainn, Feb 16, 2025)
e521b5a  test fix  (i-vainn, Feb 16, 2025)
b3fc4c2  test fix  (i-vainn, Feb 17, 2025)
9870151  fix  (i-vainn, Feb 17, 2025)
2dc56f8  fix  (i-vainn, Feb 18, 2025)
281e137  fix  (i-vainn, Feb 18, 2025)
nemo_skills/inference/generate.py: 3 additions, 0 deletions

```diff
@@ -41,6 +41,7 @@ class InferenceConfig:
     random_seed: int = 0
     tokens_to_generate: int = 2048
     repetition_penalty: float = 1.0
+    top_logprobs: int | None = None


 @nested_dataclass(kw_only=True)
@@ -56,6 +57,7 @@ class GenerateSolutionsConfig:
     prompt_template: str | None = None  # not required for OpenAI server
     prompt_config: str | None = None  # we will fetch it from dataset dir if not provided
     prefix_generation_to_response: bool = False  # whether to include "generation" as prefix to the response
+    continue_prefix_generation: bool = False  # if True, model will be prompted to continue "generation" without closing assistant tag

     examples_type: str | None = None  # to be able to customize few-shot examples
     inference: InferenceConfig = field(default_factory=InferenceConfig)  # LLM call parameters
@@ -318,6 +320,7 @@ def fill_prompt(self, data_point, data):
             data_point,
             multi_turn_key=self.cfg.multi_turn_key,
             prefix_generation_to_response=self.cfg.prefix_generation_to_response,
+            continue_prefix_generation=self.cfg.continue_prefix_generation,
         )

     def llm_generate(self, data_points, data, is_async=False):
```
nemo_skills/inference/server/code_execution_model.py: 6 additions, 0 deletions

```diff
@@ -61,9 +61,12 @@ def _generate_single(
         repetition_penalty: float,
         random_seed: int,
         stop_phrases: list[str] | None = None,
+        top_logprobs: int | None = None,
     ):
         if not isinstance(prompt, str):
             raise NotImplementedError("OpenAI API is not supported yet.")
+        if top_logprobs is not None:  # TODO: add this
+            raise NotImplementedError("top_logprobs is not supported yet.")

         if stop_phrases is None:
             stop_phrases = []
@@ -133,6 +136,7 @@ def generate_async(
         random_seed: int | list[int] = 0,
         stop_phrases: list[str] | list[list[str]] | None = None,
         remove_stop_phrases: bool = True,
+        top_logprobs: int | list[int] | None = None,
     ) -> list[dict]:
         """For any generation parameter you can specify a list of values that needs to match the number of prompts.

@@ -141,6 +145,8 @@
         # TODO: currently nemo server would get separate 1-batch requests, which is likely really inefficient
         # but the alternative is to have a fully separate implementation, which is also not nice
         # If we find ourselves needing to use nemo with code execution often, we should fix this
+        if top_logprobs is not None:  # TODO: add this
+            raise NotImplementedError("top_logprobs is not supported yet.")
         kwargs = {
             'code_begin': code_begin,
             'code_end': code_end,
```
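As the `generate_async` docstring notes, each generation parameter may be a single value or a per-prompt list whose length matches the number of prompts, and the code-execution path rejects `top_logprobs` until it is implemented. A hypothetical sketch of that contract (the `broadcast_param` helper and this simplified `generate_async` are illustrative, not code from the repo):

```python
def broadcast_param(value, num_prompts: int) -> list:
    """Expand a scalar parameter to one value per prompt; lists must already match."""
    if isinstance(value, list):
        if len(value) != num_prompts:
            raise ValueError(f"expected {num_prompts} values, got {len(value)}")
        return value
    return [value] * num_prompts


def generate_async(prompts: list[str], random_seed=0, top_logprobs=None) -> list[dict]:
    # Mirror the PR's guard: the code-execution path has no logprobs support yet
    if top_logprobs is not None:
        raise NotImplementedError("top_logprobs is not supported yet.")
    seeds = broadcast_param(random_seed, len(prompts))
    # One request dict per prompt, each carrying its own parameter values
    return [{"prompt": p, "random_seed": s} for p, s in zip(prompts, seeds)]


requests = generate_async(["q1", "q2"], random_seed=[1, 2])
# requests[0] uses seed 1, requests[1] uses seed 2
```

Raising `NotImplementedError` early, before any request is dispatched, is cheaper than letting a batch partially run and then fail on an unsupported parameter.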