568 wire eval lite #616

Merged
10 commits merged on Jan 15, 2025
4 changes: 4 additions & 0 deletions docker-compose.yaml
@@ -109,6 +109,10 @@ services:
- PYTHONPATH=/mzai/lumigator/python/mzai/backend
- EVALUATOR_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator/requirements.txt
- EVALUATOR_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator
# TODO: the following two rows should be renamed to EVALUATOR_*
# and the two above should be removed when we deprecate evaluator
- EVALUATOR_LITE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
- EVALUATOR_LITE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator_lite
- INFERENCE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/inference/requirements.txt
- INFERENCE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/inference
- RAY_DASHBOARD_PORT
18 changes: 18 additions & 0 deletions lumigator/python/mzai/backend/backend/api/routes/jobs.py
@@ -11,6 +11,7 @@
Job,
JobAnnotateCreate,
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobLogsResponse,
JobResponse,
@@ -82,6 +83,23 @@ def create_evaluation_job(
return job_response


# TODO: remove the code above and refactor the method below to respond
# to "/evaluate/" when we deprecate evaluator
@router.post("/eval_lite/", status_code=status.HTTP_201_CREATED)
def create_evaluation_lite_job(
service: JobServiceDep,
job_create_request: JobEvalLiteCreate,
request: Request,
response: Response,
) -> JobResponse:
job_response = service.create_job(job_create_request)

url = request.url_for(get_job.__name__, job_id=job_response.id)
response.headers[HttpHeaders.LOCATION] = f"{url}"

return job_response


@router.get("/")
def list_jobs(
service: JobServiceDep,
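Note: a request against the new route might look like the sketch below. The /api/v1/jobs prefix, host, and port are assumptions about how this router is mounted; the payload fields follow the JobEvalLiteCreate schema added further down in this PR.

import httpx

# Hypothetical payload matching JobEvalLiteCreate (model and dataset values are made up)
payload = {
    "name": "summarization-eval",
    "model": "hf://facebook/bart-large-cnn",  # passed for reference only, no inference is run
    "dataset": "00000000-0000-0000-0000-000000000000",  # UUID of an already-uploaded dataset
    "max_samples": 10,  # -1 (the default) means "evaluate every sample"
}

# Assumed mount point for the jobs router on a local backend
resp = httpx.post("http://localhost:8000/api/v1/jobs/eval_lite/", json=payload)
resp.raise_for_status()  # expects HTTP 201
print(resp.json()["id"], resp.headers["Location"])  # job id plus the Location header set above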
24 changes: 24 additions & 0 deletions lumigator/python/mzai/backend/backend/config_templates.py
@@ -68,6 +68,24 @@
}}
}}"""

# TODO: this default evaluation template should serve for most purposes
# after we deprecate evaluator, as it won't include predictions (model
# name is just passed for reference and not for actual inference).
# We can remove all the above templates then.
default_eval_template = """{{
"name": "{job_name}/{job_id}",
"model": {{ "path": "{model_uri}" }},
"dataset": {{ "path": "{dataset_path}" }},
"evaluation": {{
"metrics": ["rouge", "meteor", "bertscore"],
"max_samples": {max_samples},
"return_input_data": true,
"return_predictions": true,
"storage_path": "{storage_path}"
}}
}}"""


# Inference templates

default_infer_template = """{{
@@ -153,4 +171,10 @@
"mistral://open-mistral-7b": oai_eval_template,
"llamafile://mistralai/Mistral-7B-Instruct-v0.2": oai_eval_template,
},
# TODO: Remove the old EVALUATION section and rename EVALUATION_LITE
# to EVALUATION after we deprecate evaluator. Also remove the
# unused templates above (all the eval templates except default)
JobType.EVALUATION_LITE: {
"default": default_eval_template,
},
}
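Note: the doubled braces in default_eval_template are str.format escapes, so rendering it with the eval_lite job params yields the JSON config consumed by eval_lite.py. A minimal sketch of that rendering with hypothetical values; the template body is copied from the diff above (reindented) so the snippet is self-contained.

import json

default_eval_template = """{{
    "name": "{job_name}/{job_id}",
    "model": {{ "path": "{model_uri}" }},
    "dataset": {{ "path": "{dataset_path}" }},
    "evaluation": {{
        "metrics": ["rouge", "meteor", "bertscore"],
        "max_samples": {max_samples},
        "return_input_data": true,
        "return_predictions": true,
        "storage_path": "{storage_path}"
    }}
}}"""

config = json.loads(
    default_eval_template.format(
        job_name="summarization-eval",
        job_id="1234",
        model_uri="hf://facebook/bart-large-cnn",
        dataset_path="s3://lumigator-storage/datasets/1234/dataset",
        max_samples=10,
        storage_path="s3://lumigator-storage/jobs/results/",
    )
)
assert config["evaluation"]["max_samples"] == 10  # the "{{" / "}}" pairs became literal JSON braces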
4 changes: 2 additions & 2 deletions lumigator/python/mzai/backend/backend/services/experiments.py
@@ -12,7 +12,7 @@
from lumigator_schemas.experiments import ExperimentCreate, ExperimentResponse
from lumigator_schemas.extras import ListingResponse
from lumigator_schemas.jobs import (
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobStatus,
)
@@ -138,7 +138,7 @@ def _run_eval(

# submit the job
self._job_service.create_job(
JobEvalCreate.model_validate(job_eval_dict), experiment_id=experiment_id
JobEvalLiteCreate.model_validate(job_eval_dict), experiment_id=experiment_id
)

def create_experiment(
28 changes: 25 additions & 3 deletions lumigator/python/mzai/backend/backend/services/jobs.py
@@ -7,6 +7,7 @@
from lumigator_schemas.jobs import (
JobConfig,
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobResponse,
JobResultDownloadResponse,
@@ -44,6 +45,13 @@ class JobService:
"ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
"ray_worker_gpus": settings.RAY_WORKER_GPUS,
},
JobType.EVALUATION_LITE: {
"command": settings.EVALUATOR_LITE_COMMAND,
"pip": settings.EVALUATOR_LITE_PIP_REQS,
"work_dir": settings.EVALUATOR_LITE_WORK_DIR,
"ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
"ray_worker_gpus": settings.RAY_WORKER_GPUS,
},
}

def __init__(
@@ -122,8 +130,9 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
model_url = self._set_model_type(request)

# provide a reasonable system prompt for services where none was specified
if request.system_prompt is None and not request.model.startswith("hf://"):
request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
if job_type == JobType.EVALUATION or job_type == JobType.INFERENCE:
if request.system_prompt is None and not request.model.startswith("hf://"):
request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT

# this section differs between inference and eval
if job_type == JobType.EVALUATION:
@@ -138,6 +147,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
"system_prompt": request.system_prompt,
"skip_inference": request.skip_inference,
}
elif job_type == JobType.EVALUATION_LITE:
job_params = {
"job_id": record.id,
"job_name": request.name,
"model_uri": request.model,
"dataset_path": dataset_s3_path,
"max_samples": request.max_samples,
"storage_path": self.storage_path,
}
else:
job_params = {
"job_id": record.id,
@@ -164,11 +182,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
return job_params

def create_job(
self, request: JobEvalCreate | JobInferenceCreate, experiment_id: UUID = None
self,
request: JobEvalCreate | JobEvalLiteCreate | JobInferenceCreate,
experiment_id: UUID = None,
) -> JobResponse:
"""Creates a new evaluation workload to run on Ray and returns the response status."""
if isinstance(request, JobEvalCreate):
job_type = JobType.EVALUATION
elif isinstance(request, JobEvalLiteCreate):
job_type = JobType.EVALUATION_LITE
elif isinstance(request, JobInferenceCreate):
job_type = JobType.INFERENCE
else:
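Note: the EVALUATION_LITE entry above supplies exactly what a Ray job submission needs (entrypoint command, pip requirements, working dir). The sketch below shows how those pieces could fit together with Ray's job submission client; the dashboard address is an assumption, and the real wiring, including how the rendered JSON config reaches eval_lite.py, lives in JobService and is not part of this hunk.

from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://localhost:8265")  # assumed Ray dashboard address

# Values mirror the EVALUATOR_LITE_* settings / env vars added in this PR
submission_id = client.submit_job(
    entrypoint="python eval_lite.py",  # the real entrypoint also passes the rendered config (not shown here)
    runtime_env={
        "working_dir": "/mzai/lumigator/python/mzai/jobs/evaluator_lite",
        "pip": "/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt",  # Ray accepts a requirements.txt path
    },
)
print(submission_id)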
8 changes: 6 additions & 2 deletions lumigator/python/mzai/backend/backend/settings.py
@@ -43,8 +43,6 @@ class BackendSettings(BaseSettings):
# Eval job details
EVALUATOR_WORK_DIR: str | None = None
EVALUATOR_PIP_REQS: str | None = None
# TODO: change once we remove old eval
NEW_EVALUATOR_COMMAND: str = "python evaluator/evaluator.py"

@computed_field
@property
@@ -65,6 +63,12 @@ def EVALUATOR_COMMAND_WITH_LD_PRELOAD(self) -> str: # noqa: N802
"""
return f"{self.LD_PRELOAD_PREFIX} {self.NEW_EVALUATOR_COMMAND}"

# TODO: the following should all be refactored to EVALUATOR_* and the above should
# be removed when we deprecate evaluator
EVALUATOR_LITE_WORK_DIR: str | None = None
EVALUATOR_LITE_PIP_REQS: str | None = None
EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"

# Inference job details
INFERENCE_WORK_DIR: str | None = None
INFERENCE_PIP_REQS: str | None = None
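Note: these are plain optional settings fields, so the EVALUATOR_LITE_* environment entries added to docker-compose.yaml earlier in this PR are what populate them at startup. A minimal sketch, assuming the settings class is built on pydantic-settings:

import os

from pydantic_settings import BaseSettings


class MiniSettings(BaseSettings):
    # Mirrors the three fields added above
    EVALUATOR_LITE_WORK_DIR: str | None = None
    EVALUATOR_LITE_PIP_REQS: str | None = None
    EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"


os.environ["EVALUATOR_LITE_WORK_DIR"] = "/mzai/lumigator/python/mzai/jobs/evaluator_lite"
print(MiniSettings().EVALUATOR_LITE_WORK_DIR)  # read from the environment
print(MiniSettings().EVALUATOR_LITE_COMMAND)   # falls back to the declared default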
26 changes: 15 additions & 11 deletions lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
@@ -4,8 +4,8 @@
import click
import s3fs
from datasets import load_from_disk
from eval_config import EvalJobConfig
from eval_metrics import EvaluationMetrics
from evaluate_config import EvalJobConfig
from loguru import logger


@@ -16,18 +16,21 @@ def save_to_disk(local_path: Path, data_dict: dict):
json.dump(data_dict, f)


def save_to_s3(config: dict, local_path: Path, storage_path: str):
def save_to_s3(config: EvalJobConfig, local_path: Path, storage_path: str):
s3 = s3fs.S3FileSystem()
if storage_path.endswith("/"):
storage_path = "s3://" + str(Path(storage_path[5:]) / config.get("name") / "results.json")
logger.info(f"Storing evaluation results into {local_path}...")
storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "results.json")
logger.info(f"Storing evaluation results into {storage_path}...")
s3.put_file(local_path, storage_path)


def save_outputs(config: dict, eval_results: dict) -> Path:
storage_path = config.get("evaluation").get("storage_path")
def save_outputs(config: EvalJobConfig, eval_results: dict) -> Path:
storage_path = config.evaluation.storage_path

local_path = Path("results.json")
# generate local temp file ANYWAY:
# - if storage_path is not provided, it will be stored and kept in a default dir
# - if storage_path is provided AND saving to S3 is successful, local file is deleted
local_path = Path(Path.home() / ".lumigator" / "results" / config.name / "results.json")

try:
save_to_disk(local_path, eval_results)
@@ -58,11 +61,12 @@ def run_eval(config: EvalJobConfig) -> Path:
# Load dataset given its URI
dataset = load_from_disk(config.dataset.path)
logger.info(f"Retrieving {config.dataset.path} for evaluation")
if max_samples:

# Limit dataset length if max_samples is specified
if max_samples < 1 or max_samples > len(dataset):
logger.info(f"max_samples ({max_samples}) resized to dataset size ({len(dataset)})")
# select data between the minimum and total length of dataset
num_samples = range(min(max_samples, len(dataset)))
dataset = dataset.select(num_samples)
max_samples = len(dataset)
dataset = dataset.select(range(max_samples))

# run evaluation and append to results dict
predictions = dataset["predictions"]
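Note: the new clamping logic treats any max_samples outside [1, len(dataset)] (including the -1 default coming from JobEvalLiteCreate) as "evaluate the whole dataset". A standalone sketch of that behaviour:

def clamp_max_samples(max_samples: int, dataset_len: int) -> int:
    """Out-of-range values (e.g. the -1 default) fall back to the full dataset length."""
    if max_samples < 1 or max_samples > dataset_len:
        return dataset_len
    return max_samples


assert clamp_max_samples(-1, 100) == 100   # default: evaluate everything
assert clamp_max_samples(10, 100) == 10    # explicit cap is respected
assert clamp_max_samples(500, 100) == 100  # larger than the dataset -> whole dataset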
17 changes: 6 additions & 11 deletions lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
@@ -1,22 +1,17 @@
accelerate==0.33.0
absl-py==2.1.0
bert_score==0.3.13
bitsandbytes==0.42.0
click>=8.1.7
datasets==2.20.0
einops==0.8.0
evaluate==0.4.2
loguru==0.7.2
mistralai==0.4.2
nltk==3.8.1
numpy<2.0.0
openai==1.38.0
protobuf>=3.20.2
platformdirs>=2.1
pydantic-yaml>=1.2.0
pydantic>=2.6.4
ray[default]==2.30.0
rouge-score==0.1.2
ruff==0.5.5
s3fs==2024.5.0
scipy==1.13.1
sentencepiece==0.2.0
transformers==4.43.4
urllib3>=1.26.18,<2
six>=1.14
transformers==4.48.0
urllib3==2.3.0
12 changes: 12 additions & 0 deletions lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
@@ -9,6 +9,7 @@
class JobType(str, Enum):
INFERENCE = "inference"
EVALUATION = "evaluate"
EVALUATION_LITE = "eval_lite"


class JobStatus(str, Enum):
@@ -66,6 +67,17 @@ class JobEvalCreate(BaseModel):
skip_inference: bool = False


# TODO: this has to be renamed to JobEvalCreate and the code above
# has to be removed when we deprecate evaluator
class JobEvalLiteCreate(BaseModel):
name: str
description: str = ""
model: str
dataset: UUID
max_samples: int = -1 # set to all samples by default
config_template: str | None = None


class JobInferenceCreate(BaseModel):
name: str
description: str = ""
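Note: the new schema can be exercised directly; a small sketch follows (the field values are hypothetical, and the import path matches the one used in services/experiments.py above).

from uuid import uuid4

from lumigator_schemas.jobs import JobEvalLiteCreate

req = JobEvalLiteCreate.model_validate(
    {
        "name": "summarization-eval",
        "model": "hf://facebook/bart-large-cnn",  # hypothetical model URI, kept only for reference
        "dataset": str(uuid4()),  # UUIDs arrive as strings over HTTP and are coerced by pydantic
        # max_samples omitted -> defaults to -1, i.e. evaluate every sample
    }
)
print(req.max_samples)  # -1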