From a14e60b1f024436dc1e28a8a42edeb42da361226 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Wed, 8 Jan 2025 10:42:59 +0000
Subject: [PATCH 1/5] Minor fixes to run the tool from the command line

---
 .../mzai/jobs/evaluator_lite/eval_lite.py     | 17 ++++++++++-------
 .../mzai/jobs/evaluator_lite/requirements.txt |  3 +++
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
index f065c2392..f16d7db7a 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
+++ b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
@@ -4,8 +4,8 @@
 import click
 import s3fs
 from datasets import load_from_disk
+from eval_config import EvalJobConfig
 from eval_metrics import EvaluationMetrics
-from evaluate_config import EvalJobConfig
 from loguru import logger
 
 
@@ -16,18 +16,21 @@ def save_to_disk(local_path: Path, data_dict: dict):
         json.dump(data_dict, f)
 
 
-def save_to_s3(config: dict, local_path: Path, storage_path: str):
+def save_to_s3(config: EvalJobConfig, local_path: Path, storage_path: str):
     s3 = s3fs.S3FileSystem()
     if storage_path.endswith("/"):
-        storage_path = "s3://" + str(Path(storage_path[5:]) / config.get("name") / "results.json")
-    logger.info(f"Storing evaluation results into {local_path}...")
+        storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "results.json")
+    logger.info(f"Storing evaluation results into {storage_path}...")
     s3.put_file(local_path, storage_path)
 
 
-def save_outputs(config: dict, eval_results: dict) -> Path:
-    storage_path = config.get("evaluation").get("storage_path")
+def save_outputs(config: EvalJobConfig, eval_results: dict) -> Path:
+    storage_path = config.evaluation.storage_path
 
-    local_path = Path("results.json")
+    # generate local temp file ANYWAY:
+    # - if storage_path is not provided, it will be stored and kept into a default dir
+    # - if storage_path is provided AND saving to S3 is successful, local file is deleted
+    local_path = Path(Path.home() / ".lumigator" / "results" / config.name / "results.json")
 
     try:
         save_to_disk(local_path, eval_results)
diff --git a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
index 59b266244..a52b0c8a1 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
+++ b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
@@ -1,3 +1,4 @@
+absl-py==2.1.0
 accelerate==0.33.0
 bert_score==0.3.13
 bitsandbytes==0.42.0
@@ -14,9 +15,11 @@ protobuf>=3.20.2
 pydantic-yaml>=1.2.0
 pydantic>=2.6.4
 ray[default]==2.30.0
+rouge-score==0.1.2
 ruff==0.5.5
 s3fs==2024.5.0
 scipy==1.13.1
 sentencepiece==0.2.0
+six>=1.14
 transformers==4.43.4
 urllib3>=1.26.18,<2

From ce9ad4cfbfbcb6e0cd652f2f4d647530e413b9f4 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Tue, 14 Jan 2025 07:34:30 +0000
Subject: [PATCH 2/5] Cleaned + updated deps in requirements.txt

---
 .../mzai/jobs/evaluator_lite/requirements.txt | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
index a52b0c8a1..c8f1efbe8 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
+++ b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
@@ -1,25 +1,17 @@
 absl-py==2.1.0
-accelerate==0.33.0
 bert_score==0.3.13
-bitsandbytes==0.42.0
 click>=8.1.7
 datasets==2.20.0
-einops==0.8.0
 evaluate==0.4.2
 loguru==0.7.2
-mistralai==0.4.2
 nltk==3.8.1
 numpy<2.0.0
-openai==1.38.0
-protobuf>=3.20.2
+platformdirs>=2.1
 pydantic-yaml>=1.2.0
 pydantic>=2.6.4
-ray[default]==2.30.0
 rouge-score==0.1.2
 ruff==0.5.5
 s3fs==2024.5.0
-scipy==1.13.1
-sentencepiece==0.2.0
 six>=1.14
-transformers==4.43.4
-urllib3>=1.26.18,<2
+transformers==4.48.0
+urllib3==2.3.0

From 7628db96570c71dfec3bbc069da2fa5bf050abc7 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Tue, 14 Jan 2025 12:41:58 +0000
Subject: [PATCH 3/5] Integrate eval_lite as a new job

This commit integrates eval_lite as a separate, new job, so that everything
keeps working as expected while we add it. Once everything is tested and
confirmed to work, we can deprecate the older evaluator-based approach in
favor of the two inference + evaluation jobs and remove the extra code.

TODOs have been added to make it easier to find the parts of the code that
will need to be removed.
---
 docker-compose.yaml                           |  4 +++
 .../mzai/backend/backend/api/routes/jobs.py   | 18 ++++++++++++
 .../mzai/backend/backend/config_templates.py  | 24 ++++++++++++++++
 .../backend/backend/services/experiments.py   |  4 +--
 .../mzai/backend/backend/services/jobs.py     | 28 +++++++++++++++++--
 .../python/mzai/backend/backend/settings.py   |  8 ++++--
 .../mzai/schemas/lumigator_schemas/jobs.py    | 12 ++++++++
 7 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 1af33ccd9..fe2a1cb68 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -109,6 +109,10 @@ services:
       - PYTHONPATH=/mzai/lumigator/python/mzai/backend
       - EVALUATOR_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator/requirements.txt
       - EVALUATOR_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator
+      # TODO: the following two rows should be renamed to EVALUATOR_*
+      # and the two above should be removed when we deprecate evaluator
+      - EVALUATOR_LITE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
+      - EVALUATOR_LITE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator_lite
       - INFERENCE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/inference/requirements.txt
       - INFERENCE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/inference
       - RAY_DASHBOARD_PORT
diff --git a/lumigator/python/mzai/backend/backend/api/routes/jobs.py b/lumigator/python/mzai/backend/backend/api/routes/jobs.py
index 05eeb5817..1dd5bf751 100644
--- a/lumigator/python/mzai/backend/backend/api/routes/jobs.py
+++ b/lumigator/python/mzai/backend/backend/api/routes/jobs.py
@@ -11,6 +11,7 @@
     Job,
     JobAnnotateCreate,
     JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobLogsResponse,
     JobResponse,
@@ -82,6 +83,23 @@ def create_evaluation_job(
     return job_response
 
 
+# TODO: remove the code above and refactor the method below to answer
+# to "/evaluate/" when we deprecate evaluator
+@router.post("/eval_lite/", status_code=status.HTTP_201_CREATED)
+def create_evaluation_lite_job(
+    service: JobServiceDep,
+    job_create_request: JobEvalLiteCreate,
+    request: Request,
+    response: Response,
+) -> JobResponse:
+    job_response = service.create_job(job_create_request)
+
+    url = request.url_for(get_job.__name__, job_id=job_response.id)
+    response.headers[HttpHeaders.LOCATION] = f"{url}"
+
+    return job_response
+
+
 @router.get("/")
 def list_jobs(
     service: JobServiceDep,
diff --git a/lumigator/python/mzai/backend/backend/config_templates.py b/lumigator/python/mzai/backend/backend/config_templates.py
index 473e0a442..1133aed0f 100644
--- a/lumigator/python/mzai/backend/backend/config_templates.py
+++ b/lumigator/python/mzai/backend/backend/config_templates.py
@@ -68,6 +68,24 @@
     }}
 }}"""
 
 
+# TODO: this default evaluation template should serve for most purposes
+# after we deprecate evaluator, as it won't include predictions (model
+# name is just passed for reference and not for actual inference).
+# We can remove all the above templates then.
+default_eval_template = """{{
+    "name": "{job_name}/{job_id}",
+    "model": {{ "path": "{model_uri}" }},
+    "dataset": {{ "path": "{dataset_path}" }},
+    "evaluation": {{
+        "metrics": ["rouge", "meteor", "bertscore"],
+        "max_samples": {max_samples},
+        "return_input_data": true,
+        "return_predictions": true,
+        "storage_path": "{storage_path}"
+    }}
+}}"""
+
+
 # Inference templates
 default_infer_template = """{{
@@ -153,4 +171,10 @@
         "mistral://open-mistral-7b": oai_eval_template,
         "llamafile://mistralai/Mistral-7B-Instruct-v0.2": oai_eval_template,
     },
+    # TODO: Remove the old EVALUATION section and rename EVALUATION_LITE
+    # to EVALUATION after we deprecate evaluator. Also remove the
+    # unused templates above (all the eval templates except default)
+    JobType.EVALUATION_LITE: {
+        "default": default_eval_template,
+    },
 }
diff --git a/lumigator/python/mzai/backend/backend/services/experiments.py b/lumigator/python/mzai/backend/backend/services/experiments.py
index 90febea93..f35a7739c 100644
--- a/lumigator/python/mzai/backend/backend/services/experiments.py
+++ b/lumigator/python/mzai/backend/backend/services/experiments.py
@@ -12,7 +12,7 @@
 from lumigator_schemas.experiments import ExperimentCreate, ExperimentResponse
 from lumigator_schemas.extras import ListingResponse
 from lumigator_schemas.jobs import (
-    JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobStatus,
 )
@@ -138,7 +138,7 @@ def _run_eval(
 
         # submit the job
         self._job_service.create_job(
-            JobEvalCreate.model_validate(job_eval_dict), experiment_id=experiment_id
+            JobEvalLiteCreate.model_validate(job_eval_dict), experiment_id=experiment_id
         )
 
     def create_experiment(
diff --git a/lumigator/python/mzai/backend/backend/services/jobs.py b/lumigator/python/mzai/backend/backend/services/jobs.py
index 9081b03da..f592d9653 100644
--- a/lumigator/python/mzai/backend/backend/services/jobs.py
+++ b/lumigator/python/mzai/backend/backend/services/jobs.py
@@ -7,6 +7,7 @@
 from lumigator_schemas.jobs import (
     JobConfig,
     JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobResponse,
     JobResultDownloadResponse,
@@ -44,6 +45,13 @@ class JobService:
             "ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
             "ray_worker_gpus": settings.RAY_WORKER_GPUS,
         },
+        JobType.EVALUATION_LITE: {
+            "command": settings.EVALUATOR_LITE_COMMAND,
+            "pip": settings.EVALUATOR_LITE_PIP_REQS,
+            "work_dir": settings.EVALUATOR_LITE_WORK_DIR,
+            "ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
+            "ray_worker_gpus": settings.RAY_WORKER_GPUS,
+        },
     }
 
     def __init__(
@@ -122,8 +130,9 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
         model_url = self._set_model_type(request)
 
         # provide a reasonable system prompt for services where none was specified
-        if request.system_prompt is None and not request.model.startswith("hf://"):
-            request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
+        if job_type == JobType.EVALUATION or job_type == JobType.INFERENCE:
+            if request.system_prompt is None and not request.model.startswith("hf://"):
+                request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
 
         # this section differs between inference and eval
         if job_type == JobType.EVALUATION:
@@ -138,6 +147,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
                 "system_prompt": request.system_prompt,
                 "skip_inference": request.skip_inference,
             }
+        elif job_type == JobType.EVALUATION_LITE:
+            job_params = {
+                "job_id": record.id,
+                "job_name": request.name,
+                "model_uri": request.model,
+                "dataset_path": dataset_s3_path,
+                "max_samples": request.max_samples,
+                "storage_path": self.storage_path,
+            }
         else:
             job_params = {
                 "job_id": record.id,
@@ -164,11 +182,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
         return job_params
 
     def create_job(
-        self, request: JobEvalCreate | JobInferenceCreate, experiment_id: UUID = None
+        self,
+        request: JobEvalCreate | JobEvalLiteCreate | JobInferenceCreate,
+        experiment_id: UUID = None,
     ) -> JobResponse:
         """Creates a new evaluation workload to run on Ray and returns the response status."""
         if isinstance(request, JobEvalCreate):
             job_type = JobType.EVALUATION
+        elif isinstance(request, JobEvalLiteCreate):
+            job_type = JobType.EVALUATION_LITE
         elif isinstance(request, JobInferenceCreate):
             job_type = JobType.INFERENCE
         else:
diff --git a/lumigator/python/mzai/backend/backend/settings.py b/lumigator/python/mzai/backend/backend/settings.py
index b165ad93e..4150dbe14 100644
--- a/lumigator/python/mzai/backend/backend/settings.py
+++ b/lumigator/python/mzai/backend/backend/settings.py
@@ -43,8 +43,6 @@ class BackendSettings(BaseSettings):
     # Eval job details
     EVALUATOR_WORK_DIR: str | None = None
     EVALUATOR_PIP_REQS: str | None = None
-    # TODO: change once we remove old eval
-    NEW_EVALUATOR_COMMAND: str = "python evaluator/evaluator.py"
 
     @computed_field
     @property
@@ -65,6 +63,12 @@ def EVALUATOR_COMMAND_WITH_LD_PRELOAD(self) -> str:  # noqa: N802
         """
         return f"{self.LD_PRELOAD_PREFIX} {self.NEW_EVALUATOR_COMMAND}"
 
+    # TODO: the following should all be refactored to EVALUATOR_* and the above should
+    # be removed when we deprecate evaluator
+    EVALUATOR_LITE_WORK_DIR: str | None = None
+    EVALUATOR_LITE_PIP_REQS: str | None = None
+    EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"
+
     # Inference job details
     INFERENCE_WORK_DIR: str | None = None
     INFERENCE_PIP_REQS: str | None = None
diff --git a/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py b/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
index bfe3428d9..06f9bc807 100644
--- a/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
+++ b/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
@@ -9,6 +9,7 @@
 class JobType(str, Enum):
     INFERENCE = "inference"
     EVALUATION = "evaluate"
+    EVALUATION_LITE = "eval_lite"
 
 
 class JobStatus(str, Enum):
@@ -66,6 +67,17 @@ class JobEvalCreate(BaseModel):
     skip_inference: bool = False
 
 
+# TODO: this has to be renamed to JobEvalCreate and the code above
+# has to be removed when we deprecate evaluator
+class JobEvalLiteCreate(BaseModel):
+    name: str
+    description: str = ""
+    model: str
+    dataset: UUID
+    max_samples: int = -1  # set to all samples by default
+    config_template: str | None = None
+
+
 class JobInferenceCreate(BaseModel):
     name: str
     description: str = ""

From 9ef897095d06e3c3f1ac90f9ad8e6c3e252eecfa Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Tue, 14 Jan 2025 14:56:48 +0000
Subject: [PATCH 4/5] Fixed eval_lite max_samples bug

When max_samples was -1, the value -1 was used as the number of samples to
select (instead of the dataset size), causing an error in the following
`range` call. This has been fixed by setting max_samples to len(dataset)
whenever max_samples < 1 or max_samples > len(dataset).
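
For reference, a minimal sketch of the clamping behaviour (the toy list below
stands in for the loaded dataset; the names mirror the ones used in run_eval,
but the data itself is made up):

    dataset = list(range(10))          # stand-in for the HF dataset loaded from disk
    for max_samples in (-1, 3, 100):   # -1 means "use all samples" in the job request
        n = max_samples
        if n < 1 or n > len(dataset):
            n = len(dataset)           # clamp to the dataset size, as run_eval now does
        subset = dataset[:n]
        print(max_samples, "->", len(subset))   # prints: -1 -> 10, 3 -> 3, 100 -> 10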
---
 lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
index f16d7db7a..86b76cc23 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
+++ b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
@@ -61,11 +61,12 @@ def run_eval(config: EvalJobConfig) -> Path:
     # Load dataset given its URI
     dataset = load_from_disk(config.dataset.path)
     logger.info(f"Retrieving {config.dataset.path} for evaluation")
-    if max_samples:
+
+    # Limit dataset length if max_samples is specified
+    if max_samples < 1 or max_samples > len(dataset):
         logger.info(f"max_samples ({max_samples}) resized to dataset size ({len(dataset)})")
-        # select data between the minimum and total length of dataset
-        num_samples = range(min(max_samples, len(dataset)))
-        dataset = dataset.select(num_samples)
+        max_samples = len(dataset)
+    dataset = dataset.select(range(max_samples))
 
     # run evaluation and append to results dict
     predictions = dataset["predictions"]

From d5a39fe14b85ef2317192d4cefba4fc37b966df7 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Tue, 14 Jan 2025 18:50:09 +0000
Subject: [PATCH 5/5] Hardened eval_lite pydantic config

---
 lumigator/python/mzai/jobs/evaluator_lite/eval_config.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py b/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
index 76dd2815d..3d4b3c327 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
+++ b/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
@@ -3,23 +3,25 @@
 
 class DatasetConfig(BaseModel):
     path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class ModelConfig(BaseModel):
     path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class EvaluationConfig(BaseModel):
     metrics: list[str] = Field(default=["rouge", "meteor", "bertscore"])
-    use_pipeline: bool = False
     max_samples: int = 0
     return_input_data: bool = False
     return_predictions: bool = False
-    storage_path: str = ""
+    storage_path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class EvalJobConfig(BaseModel):
-    name: str | None
+    name: str
     dataset: DatasetConfig
     model: ModelConfig
     evaluation: EvaluationConfig